I'm trying to get all the data from this website in order to later use it in some model training project (ML).
I've chosen to do it by using Scrapy + Python 3.7. So far so good. I've set up my Scrapy project structure and I started working on the scraper. In order to do this, I created some steps that need to be followed in order to accordingly get the data that I need.
Product
class from the code that I'll paste below)I've tried to reproduce the above by using the following piece of code:
import json
import re
import scrapy
PRODUCTS_XPATH = "//div[@class='col-md-3']//a/@href"
class Product:
def __init__(self, response):
self.response = response
def get_brand_name(self):
brand_name = self.response.xpath(
"normalize-space(//*[@class='product-brand-name-details']/text())"
).extract()
if not brand_name[0]:
brand_name = self.response.xpath(
"normalize-space(//h3[@class='font-weight-bold']/text())"
).extract()
return brand_name[0] if brand_name else 'Could not get product brand name.'
def get_brand_name_details(self):
brand_name_details = self.response.xpath(
"normalize-space(//*[@class='product-name-details']/text())"
).extract()
if not brand_name_details[0]:
brand_name_details = self.response.xpath(
"normalize-space(//h1[@class='title font-weight-bold']/text())"
).extract()
return brand_name_details[0] if brand_name_details else 'Could not get product brand name details.'
def get_real_category(self):
return self.response.meta.get('product_category')
def get_sku_details(self):
sku_details = self.response.xpath(
"normalize-space(//*[@class='product-sku-details']/text())"
).extract()
if not sku_details[0]:
sku_details = self.response.xpath(
"normalize-space(//h5[@class='font-weight-bold']/text())"
).extract()
return sku_details[0] if sku_details else 'Could not get product sku details.'
def get_short_desc_details(self):
short_desc_details = self.response.xpath(
"normalize-space(//p[@class='pt-2']/text())"
).extract()
return short_desc_details[0] if short_desc_details else 'Could not get product short desc details.'
def get_detail_list_price(self):
detail_list_price = self.response.xpath(
"normalize-space(//*[@class='product-detail-list-price']//text())"
).extract()
return detail_list_price[0] if detail_list_price else 'Could not get product detail list price.'
def get_price(self):
price = self.response.xpath(
"normalize-space(//*[@class='price']//text())"
).extract()
return price[0] if price else 'Could not get product price.'
def get_detail_price_save(self):
detail_price_save = self.response.xpath(
"normalize-space(//*[@class='product-detail-price-save']//text())"
).extract()
return detail_price_save[0] if detail_price_save else 'Could not get product detail price save.'
def get_detail_note(self):
detail_note = self.response.xpath(
"normalize-space(//*[@class='product-detail-note']//text())"
).extract()
return detail_note[0] if detail_note else 'Could not get product detail note.'
def get_detail_long_desc(self):
detail_long_descriptions = self.response.xpath(
"//*[@id='desc']/node()"
).extract()
detail_long_desc = ''.join([x.strip() for x in detail_long_descriptions if x.strip()])
return detail_long_desc if detail_long_desc else 'Could not get product detail long desc.'
def get_image(self):
image = self.response.xpath(
"normalize-space(//*[@id='mainContent_imgDetail']/@src)"
).extract()
return f'https://bannersolutions.com{image[0]}' if image else 'Could not get product image.'
def get_pieces_in_stock(self):
pieces_in_stock = self.response.xpath(
"normalize-space(//*[@class='badge-success']//text())"
).extract()
return pieces_in_stock[0] if pieces_in_stock else 'Unknown pieces in stock.'
def get_meta_description(self):
meta_description = self.response.xpath(
"normalize-space(//*[@name='description']/@content)"
).extract()
return meta_description[0] if meta_description else 'Could not get product meta description.'
def to_json(self):
return {
'product_brand_name_details': self.get_brand_name_details(),
'product_brand_name': self.get_brand_name(),
'product_category': self.get_real_category(),
'product_sku_details': self.get_sku_details(),
'product_short_desc_details': self.get_short_desc_details(),
'product_detail_list_price': self.get_detail_list_price(),
'product_price': self.get_price(),
'product_detail_price_save': self.get_detail_price_save(),
'product_detail_note': self.get_detail_note(),
'product_detail_long_desc': self.get_detail_long_desc(),
'product_image': self.get_image(),
'product_in_stock': self.get_pieces_in_stock(),
'product_meta_description': self.get_meta_description()
}
class BannerSolutionsSpider(scrapy.Spider):
name = 'bannersolutions'
start_urls = ['https://bannersolutions.com/Sitemap']
allowed_domains = ['bannersolutions.com']
def start_crawl(self, response):
for url in self.start_urls:
yield scrapy.Request(url)
def parse(self, response):
for category in response.xpath('(//div[@class="col-md-3"])[1]/ul/li'):
main_category_name = category.xpath('./a/text()').get()
sub_category_name = category.xpath('./ul/li/a/text()').get()
category_url = category.xpath('./ul/li/a/@href').get()
if category_url:
yield scrapy.Request(f'https://bannersolutions.com{category_url}', callback=self.parse_categories,
meta={'product_category': f'{main_category_name}/{sub_category_name}'})
def parse_categories(self, response):
title = response.xpath('//h1[@class="title"]/text()').get()
products_in_category = re.match(r'.*\((\d+)\)', title).group(1)
no_of_requests = int(products_in_category) // 8 + 1
in_cat_id = response.url.split('/')[-1]
for i in range(1, no_of_requests):
payload = {
'pageIndex': str(i),
'inViewType': 'grid',
'inPageSize': '8',
'inCatID': in_cat_id,
'inFilters': '',
'inSortType': ''
}
yield scrapy.Request(
'https://bannersolutions.com/catalog.aspx/GetProducts',
method='POST',
headers={"content-type": "application/json"},
body=json.dumps(payload),
callback=self.parse_plm,
meta={'product_category': response.meta.get('product_category')}
)
def parse_plm(self, response):
products_str_html = json.loads(response.body).get('d')
product_url = scrapy.selector.Selector(text=products_str_html).xpath(
'//div[@class="product-image-container"]//a/@href'
).get()
yield scrapy.Request(
f'https://bannersolutions.com{product_url}',
callback=self.parse_product,
meta={'product_category': response.meta.get('product_category')}
)
def parse_product(self, response):
product = Product(response).to_json()
yield product
The issue with my code is that not all the products are being parsed, only ~3k / out of 70k. Now, Where I suppose it's the issue is between the lines 148-165. I've ran it through the debugger but I still couldn't figure out what's wrong.
Can someone please explain me what's wrong in my code logic?
Not sure if that's the only issue as I don't have time to test it further, but it seems you're only parsing the first product when you load the 8-bulk data here:
# ...
product_url = scrapy.selector.Selector(text=products_str_html).xpath(
'//div[@class="product-image-container"]//a/@href'
).get()
# ...
The .get()
method won't return all the urls. You might use the getall()
method instead which returns a list with all the urls:
# ...
product_url = scrapy.selector.Selector(text=products_str_html).xpath(
'//div[@class="product-image-container"]//a/@href'
).getall()
# ...
And then just loop over the returned list and yield what you yielded before:
# ...
products_urls = scrapy.selector.Selector(text=products_str_html).xpath(
'//div[@class="product-image-container"]//a/@href'
).getall()
for product_url in products_urls:
yield scrapy.Request(
f'https://bannersolutions.com{product_url}',
callback=self.parse_product,
meta={'product_category': response.meta.get('product_category')}
)
You made the same mistake in the parse
method of your BannerSolutionsSpider
class, as you did in parse_plm
method(highlighted by @Cajuu'). Rather using getall
method to get all the hyperlinks, you used the get
method which only returns the first URL of each sub-category.
You may try below solution, it is giving all the sub-category urls to parse.
for category in response.xpath('(//div[@class="col-md-3"])[1]/ul/li'):
main_category_name = category.xpath('./a/text()').get()
for sub_category in category.xpath('./ul/li'):
sub_category_name = sub_category.xpath('./a/text()').get()
sub_category_url = sub_category.xpath('./a/@href').get()
yield scrapy.Request(f'https://bannersolutions.com{sub_category_url}', callback=self.parse_categories, meta={'product_category': f'{main_category_name}/{sub_category_name}'})
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With