I am trying to extract and download all images from a URL. I wrote this script:
# Python 2 script: fetch one page, find every <img ... src="..."> with a
# regular expression, and save each image into the current directory.
import urllib2
import re
from os.path import basename
from urlparse import urljoin, urlsplit

url = "http://filmygyan.in/katrina-kaifs-top-10-cutest-pics-gallery/"
urlContent = urllib2.urlopen(url).read()

# HTML image tag: <img src="url" alt="some_text"/>
imgUrls = re.findall('img .*?src="(.*?)"', urlContent)

# download all images
for imgUrl in imgUrls:
    # A src attribute may be relative ("/img/a.jpg", "a.jpg", "//cdn/a.jpg");
    # urlopen() rejects those, so resolve them against the page URL first.
    imgUrl = urljoin(url, imgUrl)

    # Use the last path component as the local file name; skip sources whose
    # path ends in "/" because there is nothing to name the file after.
    fileName = basename(urlsplit(imgUrl)[2])
    if not fileName:
        continue

    try:
        imgData = urllib2.urlopen(imgUrl).read()
        # "with" closes the file even when write() fails.
        with open(fileName, 'wb') as output:
            output.write(imgData)
    except (urllib2.URLError, IOError, ValueError) as err:
        # Stay best-effort (one bad image must not stop the rest), but report
        # the failure instead of silently swallowing every exception.
        print("Skipping %s: %s" % (imgUrl, err))
I don't want to extract the image of this page (see this image: http://i.share.pho.to/1c9884b1_l.jpeg). I just want to get all the images without clicking on the "Next" button. I can't figure out how to get all the pictures behind the "Next" class. What changes should I make to findall?
A simple code to perform the download: parser') image_tags = soup. find_all('img') urls = [img['src'] for img in image_tags] for url in urls: filename = re.search(r'/([\w_-]+[.]( jpg|gif|png))$', url) if not filename: print("Regular expression didn't match with the url: {}". format(url)) continue with open(filename.
The following should extract all images from a given page and write them to the directory where the script is being run.
# Download every <img> of a page whose file name ends in .jpg/.gif/.png and
# write it to the directory where the script is being run.
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

site = 'http://pixabay.com'

response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')

# src=True keeps only tags that carry a src attribute; img['src'] would
# raise KeyError on an <img> without one.
img_tags = soup.find_all('img', src=True)
urls = [img['src'] for img in img_tags]

for url in urls:
    filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
    if not filename:
        print("Regex didn't match with the url: {}".format(url))
        continue

    # Sometimes an image source is relative ("/a.png", "a.png") or
    # protocol-relative ("//cdn.example/a.png"). urljoin resolves all of
    # these against the page URL and leaves absolute URLs untouched.
    url = urljoin(site, url)

    # Download first so that a failed request does not leave an empty file
    # behind, and so that an HTTP error page is never saved as an image.
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Download failed for {}: {}".format(url, err))
        continue

    with open(filename.group(1), 'wb') as f:
        f.write(response.content)
Slight modification to Jonathan's answer (because I can't comment): adding 'www' to the website will fix most "File Type Not Supported" errors.
# Download every <img> of a page whose file name ends in .jpg/.gif/.png and
# write it to the directory where the script is being run.
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

site = 'http://www.google.com'

response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')

# Restrict the search to tags that have a src attribute, since indexing
# img['src'] on a tag without one raises KeyError.
img_tags = soup.find_all('img', src=True)
urls = [img['src'] for img in img_tags]

for url in urls:
    filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
    if not filename:
        print("Regex didn't match with the url: {}".format(url))
        continue

    # An image source can be relative or protocol-relative; resolve it
    # against the page URL (absolute URLs pass through unchanged).
    url = urljoin(site, url)

    # Fetch before opening the output file: a failed or non-2xx request is
    # reported and skipped rather than leaving an empty or bogus file.
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Download failed for {}: {}".format(url, err))
        continue

    with open(filename.group(1), 'wb') as f:
        f.write(response.content)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With