I've tried copying and pasting the XPath of the site's elements, but it returns no results.
Can Scrapy scrape data that is inside an iframe? If yes, how? If not, what should be done instead? Thanks!
# Crawl rules: a single rule whose link extractor excludes URLs matching
# ``path_deny_base`` and restricts link extraction to the XPath '*'.
# Matching pages are handed to the "parse" callback and their links are
# followed in turn.
# NOTE(review): if the enclosing spider is a CrawlSpider, Scrapy's docs advise
# against using "parse" as a rule callback -- confirm and rename if so.
rules = (
    Rule(
        SgmlLinkExtractor(
            deny=path_deny_base,
            restrict_xpaths=('*'),
        ),
        callback="parse",
        follow=True,
    ),
)
def parse(self, response):
    """Request the document behind every ``<iframe>`` found in ``response``.

    The content of an iframe is a separate document, so it is not part of
    the parent page's response. Each iframe ``src`` is resolved against the
    page URL and requested on its own; the resulting response is handled by
    ``parse_iframe``.

    Yields:
        One ``Request`` per iframe ``src`` attribute in the page.
    """
    # Local import keeps the block self-contained on both Python 2 and 3.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    hxs = HtmlXPathSelector(response)
    for src in hxs.select('//iframe/@src').extract():
        # ``src`` may be relative; Request needs an absolute URL.
        yield Request(urljoin(response.url, src), callback=self.parse_iframe)
def parse_iframe(self, response):
    """Build one ``CraigslistSampleItem`` per ``//div[2]/h1`` node in ``response``.

    Every item receives:
      * ``title`` -- the stripped text nodes of the heading,
      * ``link``  -- the ``href`` value(s) of ``//div[2]/div[10]/a[1]``,
      * ``info``  -- the URL of the response,
      * ``code``  -- the first text node of each of the three location
        spans, joined with ``-`` (a missing span contributes an empty
        string).

    Returns:
        A list with the items built from the response (empty when no
        heading matches).
    """
    hxs = HtmlXPathSelector(response)
    headings = hxs.select('//div[2]/h1')

    # The link and the location code do not depend on the heading being
    # processed, so they are extracted once rather than once per item.
    linker = hxs.select('//div[2]/div[10]/a[1]')
    link_hrefs = linker.select('@href').extract()

    location_xpaths = (
        '//div[2]/div[1]/div[2]/span/span/span[1]',
        '//div[2]/div[1]/div[2]/span/span/span[2]',
        '//div[2]/div[1]/div[2]/span/span/span[3]',
    )
    parts = []
    for xpath in location_xpaths:
        texts = hxs.select(xpath).select('text()').extract()
        parts.append(texts[0] if texts else u"")
    # A unicode template is required: on Python 2 a byte-string template
    # raises UnicodeEncodeError as soon as a part contains non-ASCII text.
    code = u"{0}-{1}-{2}".format(*parts)

    items = []
    for heading in headings:
        item = CraigslistSampleItem()
        # A list comprehension behaves the same on Python 2 and 3, unlike
        # map(unicode.strip, ...), which is Python 2-only.
        item["title"] = [text.strip() for text in heading.select('text()').extract()]
        # Each item gets its own list so items never share mutable state.
        item["link"] = list(link_hrefs)
        item["info"] = response.url
        item["code"] = code
        items.append(item)
    return items
Scrapy cannot scrape the content of an iframe directly. Instead, make a request to the iframe's URL, like this:
def parse(self, response):
    # ``url`` is a placeholder: substitute the URL the iframe points to.
    yield Request(url, callback=self.parse_iframe)


def parse_iframe(self, response):
    # your code to scrape the content from iframe
    # ``pass`` keeps the stub syntactically valid until that code is added.
    pass
Here, `url` should be the iframe URL (for example, https://career-meridia....../jobs).
Edit:
Replace `url` with the red-underlined part. Edit 2: Make sure you have passed every parameter needed by the iframe URL; otherwise you will get nothing. If it uses the POST method, you have to pass all the POST parameters.
This is the way I'm doing it: first get the iframe URL, then call parse again on it.
urls = response.css('iframe::attr(src)').extract()
for url in urls :
yield scrapy.Request(url....)
If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With