My items.py file goes like this:
from scrapy.item import Item, Field
class SpiItem(Item):
    """Item holding the four values the spider extracts for each entry."""

    # Each Field() declares a key that may be assigned on the item.
    title = Field()
    lat = Field()
    lng = Field()
    add = Field()  # presumably the address; filled from the "html" value by the spider
and the spider is:
import scrapy
import re
from spi.items import SpiItem
class HdfcSpider(scrapy.Spider):
    """Spider that requests the hdfc.com branch-locator page and yields SpiItem objects."""

    name = "hdfc"
    allowed_domains = ["hdfc.com"]
    start_urls = ["http://hdfc.com/branch-locator"]

    def parse(self, response):
        """Yield one SpiItem for every <script> element in the response.

        Each field is set to the list of regex matches extracted from the
        selected script text.
        """
        addresses = response.xpath('//script')
        for sel in addresses:
            # NOTE(review): this XPath starts with `//`, so it is evaluated
            # against the whole document rather than relative to `sel`.
            script = sel.xpath('//script[@type="text/javascript"][1]')
            item = SpiItem()
            item['title'] = script.re('(?<="title":).+(?=")')
            item['lat'] = script.re('(?<="latitude":).+(?=")')
            item['lng'] = script.re('(?<="longitude":).+(?=")')
            item['add'] = script.re('(?<="html":).+(?=")')
            yield item
The whole JavaScript code, when viewing the page source, is located inside: //html/body/table/tbody/tr[348]/td[2].
Why is my code not working? I want to extract just the four fields mentioned in the items file.
Instead of extracting field by field using regular expressions, extract the complete `locations` object, load it via `json.loads()`, and extract the desired data from the Python dictionary you'll get:
def parse(self, response):
    """Print the title (key) of every entry in the page's ``locations`` object.

    The ``var locations= {...};`` assignment is located inside a <script>
    element, the object literal is captured with a regular expression and
    decoded with ``json.loads``.

    Returns without printing anything when no such assignment is found.
    """
    # Local import: the surrounding snippets import only scrapy and re.
    import json

    # DOTALL lets `.` span newlines, so a multi-line object literal is captured.
    pattern = re.compile(r"var locations= ({.*?});", re.MULTILINE | re.DOTALL)
    matches = response.xpath('//script[contains(., "var locations")]/text()').re(pattern)
    if not matches:
        # Indexing [0] on an empty result would raise IndexError.
        return

    locations = json.loads(matches[0])
    # `data` is the value stored under each title; the desired fields can be
    # read from it. items()/print() work on both Python 2 and Python 3.
    for title, data in locations.items():
        print(title)
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With