So I have a crawler which works well for extracting information about gigs. However, in the information I scrape is a url displaying further information about the gig listed such as the style of music. How do I scrape inside that url and carry on scraping everything else?
Here ismy code. Any help really appreciated.
import scrapy # Import required libraries.
from scrapy.selector import HtmlXPathSelector # Allows for path detection in a websites code.
from scrapy.spider import BaseSpider # Used to create a simple spider to extract data.
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor # Needed for the extraction of href links in HTML to crawl further pages.
from scrapy.contrib.spiders import CrawlSpider # Needed to make the crawl spider.
from scrapy.contrib.spiders import Rule # Allows specified rules to affect what the link
from urlparse import urlparse
import soundcloud
import mysql.connector
import requests
import time
from datetime import datetime
from tutorial.items import TutorialItem
genre = ["Dance",
"Festivals",
"Rock/pop"
]
class AllGigsSpider(CrawlSpider):
name = "allGigs" # Name of the Spider. In command promt, when in the correct folder, enter "scrapy crawl Allgigs".
allowed_domains = ["www.allgigs.co.uk"] # Allowed domains is a String NOT a URL.
start_urls = [
#"http://www.allgigs.co.uk/whats_on/London/clubbing-1.html",
#"http://www.allgigs.co.uk/whats_on/London/festivals-1.html",
"http://www.allgigs.co.uk/whats_on/London/tours-65.html"
]
rules = [
Rule(SgmlLinkExtractor(restrict_xpaths='//div[@class="more"]'), # Search the start URL's for
callback="parse_item",
follow=True),
]
def parse_start_url(self, response):#http://stackoverflow.com/questions/15836062/scrapy-crawlspider-doesnt-crawl-the-first-landing-page
return self.parse_item(response)
def parse_item(self, response):
for info in response.xpath('//div[@class="entry vevent"]'):
item = TutorialItem() # Extract items from the items folder.
item ['table'] = "London"
item ['url'] = info.xpath('.//a[@class="url"]/@href').extract()
print item ['url']
item ['genres'] = info.xpath('.//li[@class="style"]//text() | ./parent::a[@class="url"]/preceding-sibling::li[@class="style"]//text').extract()
print item ['genres']
item ['artist'] = info.xpath('.//span[@class="summary"]//text()').extract() # Extract artist information.
item ['venue'] = info.xpath('.//span[@class="vcard location"]//text()').extract() # Extract artist information.
item ['borough'] = info.xpath('.//span[@class="adr"]//text()').extract() # Extract artist information.
item ['date'] = info.xpath('.//span[@class="dates"]//text()').extract() # Extract date information.
a, b, c = item["date"][0].split()
item['dateForm']=(datetime.strptime("{} {} {} {}".format(a,b.rstrip("ndthstr"),c,"2015"),"%a %d %b %Y").strftime("%Y,%m,%d"))
preview = ''.join(str(s)for s in item['artist'])
item ['genre'] = info.xpath('.//div[@class="header"]//text() | ./parent::div[@class="rows"]/preceding-sibling::div[@class="header"]//text()').extract()
client = soundcloud.Client(client_id='401c04a7271e93baee8633483510e263', client_secret='b6a4c7ba613b157fe10e20735f5b58cc', callback='http://localhost:9000/#/callback.html')
tracks = client.get('/tracks', q = preview, limit=1)
for track in tracks:
print track.id
item ['trackz'] = track.id
yield item
The a[@class="url"] is what I want to get into. The li[@class="style"] contains the information I require inside the url. many thanks
Heres an update on the situation. The code i have attempted here produces an assertion error. Bit perplexed by this...
item ['url'] = info.xpath('.//a[@class="url"]/@href').extract()
item ['url'] = ''.join(str(t) for t in item['url'])
yield Request (item['url'], callback='continue_item', meta={'item': item})
def countinue_item(self, response):
item = response.meta.get('item')
item['genres']=info.xpath('.//li[@class="style"]//text()').extract()
print item['genres']
return self.parse_parse_item(response)
I change the item['url'] to a string with the .join function. then in the continue_item i scrape inside the url (or at least its supposed to!) and return the result. However as mentioned, not working properly yet. Dont think its too far away
You need to continue crawling it with a new method, like:
from scrapy.http import Request
...
def parse_item(self, response):
...
yield Request(item['url'], callback=self.continue_item, meta={'item': item})
def continue_item(self, response):
item = response.meta.get('item')
...
yield item
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With