Passing arguments to callback functions with Scrapy, so can receive the arguments later crash

Question

I try to get this spider work and if request the components to be scraped separately it works, however when try to use Srapy callback function to receive the arguments later i get crashed. The goal is to craw over multiple pages and scrape data while write in output json file in format:

author | album | title | lyrics

the data for each is located on separate web pages, so that is why I'm tying to use Scrapy callback function to get that accomplished.

Also each of the above items are defined under Scrapy items.py as:

import scrapy

class TutorialItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
    author = scrapy.Field()
    album = scrapy.Field()
    title = scrapy.Field()
    lyrics = scrapy.Field()

Spider Code start here:

import scrapy
import re
import json

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from tutorial.items import TutorialItem


# urls class
class DomainSpider(scrapy.Spider):
    name = "domainspider"
    allowed_domains = ['www.domain.com']
    start_urls = [
        'http://www.domain.com',
    ]

    rules = (
        Rule(LinkExtractor(allow='www\.domain\.com/[A-Z][a-zA-Z_/]+$'), 
            'parse', follow=True,
        ),
    )

    # Parsing start here
    # crawling and scraping the links from menu list
    def parse(self, response):
        links = response.xpath('//html/body/nav[1]/div/ul/li/div/a/@href')

        for link in links:
            next_page_link = link.extract()
            if next_page_link:
                next_page = response.urljoin(next_page_link)
                yield scrapy.Request(next_page, callback=self.parse_artist_page)

    # crawling and scraping artist names and links
    def parse_artist_page(self, response):
        artist_links = response.xpath('//*/div[contains(@class, "artist-col")]/a/@href')
        author = response.xpath('//*/div[contains(@class, "artist-col")]/a/text()').extract()

        item = TutorialItem(author=author)

        for link in artist_links:
            next_page_link = link.extract()
            if next_page_link:
                next_page = response.urljoin(next_page_link)
                yield scrapy.Request(next_page, callback=self.parse_album_page)

                request.meta['author'] = item
                yield item
                return

    # crawling and scraping album names and links
    def parse_album_page(self, response):
        album_links = response.xpath('//*/div[contains(@id, "listAlbum")]/a/@href')
        album = response.xpath('//*/div[contains(@class, "album")]/b/text()').extract()

        item = TutorialItem(album=album)

        for link in album_links:
            next_page_link = link.extract()
            if next_page_link:
                next_page = response.urljoin(next_page_link)
                yield scrapy.Request(next_page, callback=self.parse_lyrics_page)

                request.meta['album'] = item
                yield item
                return

    # crawling and scraping titles and lyrics
    def parse_lyrics_page(self, response):
        title = response.xpath('//html/body/div[3]/div/div[2]/b/text()').extract()
        lyrics = map(unicode.strip, response.xpath('//html/body/div[3]/div/div[2]/div[6]/text()').extract())

        item = response.meta['author', 'album']
        item = TutorialItem(author=author, album=album, title=title, lyrics=lyrics)
        yield item

The code crash when get to call back function:

request.meta['author'] = item
yield item
return

Can anyone help?

Krasimir Vatchinsky · Accepted Answer

I did found where was the problem, the way callback function was set by me, now works:

# crawling and scraping artist names and links
    def parse_artist_page(self, response):
        artist_links = response.xpath('//*/div[contains(@class, "artist-col")]/a/@href')
        author = response.xpath('//*/div[contains(@class, "artist-col")]/a/text()').extract()

        for link in artist_links:
            next_page_link = link.extract()
            if next_page_link:
                next_page = response.urljoin(next_page_link)
                request = scrapy.Request(next_page, callback=self.parse_album_page)
                request.meta['author'] = author
                return request

    # crawling and scraping album names and links
    def parse_album_page(self, response):
        author = response.meta.get('author')

        album_links = response.xpath('//*/div[contains(@id, "listAlbum")]/a/@href')
        album = response.xpath('//*/div[contains(@class, "album")]/b/text()').extract()


        for link in album_links:
            next_page_link = link.extract()
            if next_page_link:
                next_page = response.urljoin(next_page_link)
                request = scrapy.Request(next_page, callback=self.parse_lyrics_page)
                request.meta['author'] = author
                request.meta['album'] = album
                return request

    # crawling and scraping song titles and lyrics
    def parse_lyrics_page(self, response):
        author = response.meta.get('author')
        album = response.meta.get('album')

        title = response.xpath('//html/body/div[3]/div/div[2]/b/text()').extract()
        lyrics = map(unicode.strip, response.xpath('//html/body/div[3]/div/div[2]/div[6]/text()').extract())

        item = TutorialItem(author=author, album=album, title=title, lyrics=lyrics)
        yield item

Passing arguments to callback functions with Scrapy, so can receive the arguments later crash

Tags:

python

scrapy

scrapy-spider

Krasimir Vatchinsky

1 Answers

Krasimir Vatchinsky

Recent Activity

Donate For Us

Passing arguments to callback functions with Scrapy, so can receive the arguments later crash

Tags:

python

scrapy

scrapy-spider

Krasimir Vatchinsky

1 Answers

Krasimir Vatchinsky

Related questions

Recent Activity

Donate For Us