I can't figure out why Scrapy crawls the first page but doesn't follow the links to the subsequent pages. It must be something to do with the Rules. Much appreciated. Thank you!
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistItem

class MySpider(CrawlSpider):
    name = "craig"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/acc/"]

    # Follow the "next page" link and parse each page it leads to
    rules = (
        Rule(SgmlLinkExtractor(allow=("index100\.html",),
                               restrict_xpaths=('//p[@id="nextpage"]',)),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//p")
        items = []
        for title in titles:
            item = CraigslistItem()
            item["title"] = title.select("a/text()").extract()
            item["link"] = title.select("a/@href").extract()
            items.append(item)
        return items
Craigslist uses index100, index200, index300, ... for the next pages; the maximum is index900. Your allow pattern matches only index100.html, so every later pagination link is filtered out and the crawl stops. Generalize the pattern so it matches all of them:
rules = (
    Rule(SgmlLinkExtractor(allow=("index\d00\.html",),
                           restrict_xpaths=('//p[@id="nextpage"]',)),
         callback="parse_items", follow=True),
)
works for me.
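A side note if you are on a newer Scrapy release: scrapy.contrib and the SGML-based link extractor have since been removed. Here is a minimal sketch of the same spider against the current API, assuming your CraigslistItem still has title and link fields; only the import paths and the selector calls change.

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor
    from craigslist_sample.items import CraigslistItem

    class MySpider(CrawlSpider):
        name = "craig"
        allowed_domains = ["sfbay.craigslist.org"]
        start_urls = ["http://sfbay.craigslist.org/acc/"]

        rules = (
            # LinkExtractor replaces SgmlLinkExtractor; the generalized
            # pattern matches index100.html through index900.html
            Rule(LinkExtractor(allow=(r"index\d00\.html",),
                               restrict_xpaths=('//p[@id="nextpage"]',)),
                 callback="parse_items", follow=True),
        )

        def parse_items(self, response):
            # response.xpath() replaces HtmlXPathSelector
            for p in response.xpath("//p"):
                item = CraigslistItem()
                item["title"] = p.xpath("a/text()").extract()
                item["link"] = p.xpath("a/@href").extract()
                yield item

You run it the same way as before, with scrapy crawl craig.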