I'm attempting to rename the images that are downloaded by my Scrapy 0.24 spider. Right now the downloaded images are stored with a SHA1 hash of their URLs as the file names. I'd like to instead name them the value I extract with item['model']
. This question from 2011 outlines what I want, but the answers are for previous versions of Scrapy and don't work with the latest version.
Once I manage to get this working I'll also need to make sure I account for different images being downloaded with the same filename. So I'll need to download each image to its own uniquely named folder, presumably based on the original URL.
Here is a copy of the code I am using in my pipeline. I got this code from a more recent answer in the link above, but it's not working for me. Nothing errors out and the images are downloaded as normal. It doesn't seem my extra code has any effect on the filenames as they still appear as SHA1 hashes.
pipelines.py
class AllenheathPipeline(object):
def process_item(self, item, spider):
return item
import scrapy
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.http import Request
from scrapy.exceptions import DropItem
class MyImagesPipeline(ImagesPipeline):
#Name download version
def file_path(self, request, response=None, info=None):
item=request.meta['item'] # Like this you can use all from item, not just url.
image_guid = request.url.split('/')[-1]
return 'full/%s' % (image_guid)
#Name thumbnail version
def thumb_path(self, request, thumb_id, response=None, info=None):
image_guid = thumb_id + request.url.split('/')[-1]
return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid)
def get_media_requests(self, item, info):
#yield Request(item['images']) # Adding meta. I don't know, how to put it in one line :-)
for image in item['images']:
yield Request(image)
def item_completed(self, results, item, info):
image_paths = [x['path'] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
item['image_paths'] = image_paths
return item
settings.py
BOT_NAME = 'allenheath'
SPIDER_MODULES = ['allenheath.spiders']
NEWSPIDER_MODULE = 'allenheath.spiders'
ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
IMAGES_STORE = 'c:/allenheath/images'
products.py (my spider)
import scrapy
import urlparse
from allenheath.items import ProductItem
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
class productsSpider(scrapy.Spider):
name = "products"
allowed_domains = ["http://www.allen-heath.com/"]
start_urls = [
"http://www.allen-heath.com/ahproducts/ilive-80/",
"http://www.allen-heath.com/ahproducts/ilive-112/"
]
def parse(self, response):
for sel in response.xpath('/html'):
item = ProductItem()
item['model'] = sel.css('#prodsingleouter > div > div > h2::text').extract() # The value I'd like to use to name my images.
item['shortdesc'] = sel.css('#prodsingleouter > div > div > h3::text').extract()
item['desc'] = sel.css('#tab1 #productcontent').extract()
item['series'] = sel.css('#pagestrip > div > div > a:nth-child(3)::text').extract()
item['imageorig'] = sel.css('#prodsingleouter > div > div > h2::text').extract()
item['image_urls'] = sel.css('#tab1 #productcontent .col-sm-9 img').xpath('./@src').extract()
item['image_urls'] = [urlparse.urljoin(response.url, url) for url in item['image_urls']]
yield item
items.py
import scrapy
class ProductItem(scrapy.Item):
model = scrapy.Field()
itemcode = scrapy.Field()
shortdesc = scrapy.Field()
desc = scrapy.Field()
series = scrapy.Field()
imageorig = scrapy.Field()
image_urls = scrapy.Field()
images = scrapy.Field()
Here's a pastebin of the output I get from the command prompt when I run the spider: http://pastebin.com/ir7YZFqf
Any help would be greatly appreciated!
The pipelines.py:
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request
from scrapy.exceptions import DropItem
from scrapy import log
class MyImagesPipeline(ImagesPipeline):
#Name download version
def file_path(self, request, response=None, info=None):
image_guid = request.meta['model'][0]
log.msg(image_guid, level=log.DEBUG)
return 'full/%s' % (image_guid)
#Name thumbnail version
def thumb_path(self, request, thumb_id, response=None, info=None):
image_guid = thumb_id + request.url.split('/')[-1]
log.msg(image_guid, level=log.DEBUG)
return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid)
def get_media_requests(self, item, info):
yield Request(item['image_urls'][0], meta=item)
You're using the settings.py
wrong. You should use this:
ITEM_PIPELINES = {'allenheath.pipelines.MyImagesPipeline': 1}
For thumbsnails to work, add this to settings.py
:
IMAGES_THUMBS = {
'small': (50, 50),
'big': (100, 100),
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With