Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How can I download images on a page using puppeteer?

I'm new to web scraping and want to download all images on a webpage using puppeteer:

const puppeteer = require('puppeteer');

let scrape = async () => {
  // Actual Scraping goes Here...

  const browser = await puppeteer.launch({headless: false});
  const page = await browser.newPage();
  await page.goto('https://memeculture69.tumblr.com/');

  //   Right click and save images

};

scrape().then((value) => {
    console.log(value); // Success!
});

I have looked at the API‌ docs but could not figure out how to acheive this. So appreciate your help.

like image 1000
supermario Avatar asked Sep 27 '18 17:09

supermario


People also ask

How do I download files from puppeteer?

We can use the following script to automate the download process. const puppeteer = require('puppeteer'); async function simplefileDownload() { const browser = await puppeteer. launch({ headless: false }); const page = await browser. newPage(); await page.

How do you take a screenshot on a puppeteer?

To take the screenshot of the page area, use the clip option: 'use strict'; const puppeteer = require('puppeteer'); (async () => { const browser = await puppeteer. launch(); try { const page = await browser. newPage(); await page.


4 Answers

If you want to skip the manual dom traversal you can write the images to disk directly from the page response.

Example:

const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    page.on('response', async response => {
        const url = response.url();
        if (response.request().resourceType() === 'image') {
            response.buffer().then(file => {
                const fileName = url.split('/').pop();
                const filePath = path.resolve(__dirname, fileName);
                const writeStream = fs.createWriteStream(filePath);
                writeStream.write(file);
            });
        }
    });
    await page.goto('https://memeculture69.tumblr.com/');
    await browser.close();
})();

See the documentation for page.on and for the HTTPResponse object that you get from page.on('response', ...).

like image 56
Ben Adam Avatar answered Oct 16 '22 15:10

Ben Adam


Here is another example. It goes to a generic search in google and downloads the google image at the top left.

const puppeteer = require('puppeteer');
const fs = require('fs');

async function run() {
    const browser = await puppeteer.launch({
        headless: false
    });
    const page = await browser.newPage();
    await page.setViewport({ width: 1200, height: 1200 });
    await page.goto('https://www.google.com/search?q=.net+core&rlz=1C1GGRV_enUS785US785&oq=.net+core&aqs=chrome..69i57j69i60l3j69i65j69i60.999j0j7&sourceid=chrome&ie=UTF-8');

    const IMAGE_SELECTOR = '#tsf > div:nth-child(2) > div > div.logo > a > img';
    let imageHref = await page.evaluate((sel) => {
        return document.querySelector(sel).getAttribute('src').replace('/', '');
    }, IMAGE_SELECTOR);

    console.log("https://www.google.com/" + imageHref);
    var viewSource = await page.goto("https://www.google.com/" + imageHref);
    fs.writeFile(".googles-20th-birthday-us-5142672481189888-s.png", await viewSource.buffer(), function (err) {
    if (err) {
        return console.log(err);
    }

    console.log("The file was saved!");
});

    browser.close();
}

run();

If you have a list of images you want to download then you could change the selector to programatically change as needed and go down the list of images downloading them one at a time.

like image 26
Braden Brown Avatar answered Oct 16 '22 14:10

Braden Brown


You can use the following to scrape an array of all the src attributes of all images on the page:

const images = await page.evaluate(() => Array.from(document.images, e => e.src));

Then you can use the Node File System Module and HTTP or HTTPS Module to download each image.

Complete Example:

'use strict';

const fs = require('fs');
const https = require('https');
const puppeteer = require('puppeteer');

/* ============================================================
  Promise-Based Download Function
============================================================ */

const download = (url, destination) => new Promise((resolve, reject) => {
  const file = fs.createWriteStream(destination);

  https.get(url, response => {
    response.pipe(file);

    file.on('finish', () => {
      file.close(resolve(true));
    });
  }).on('error', error => {
    fs.unlink(destination);

    reject(error.message);
  });
});

/* ============================================================
  Download All Images
============================================================ */

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  let result;

  await page.goto('https://www.example.com/');

  const images = await page.evaluate(() => Array.from(document.images, e => e.src));

  for (let i = 0; i < images.length; i++) {
    result = await download(images[i], `image-${i}.png`);

    if (result === true) {
      console.log('Success:', images[i], 'has been downloaded successfully.');
    } else {
      console.log('Error:', images[i], 'was not downloaded.');
      console.error(result);
    }
  }

  await browser.close();
})();
like image 14
Grant Miller Avatar answered Oct 16 '22 15:10

Grant Miller


The logic is simple i think. You just need to make a function which will take url of image and save it to your directory. The puppeteer will just scrape the image url and pass it to downloader function. Here is an example:

const puppeteer = require('puppeteer');
const fs = require('fs');
const request = require('request');

//  This is main download function which takes the url of your image
function download(uri, filename) {
  return new Promise((resolve, reject) => {
    request.head(uri, function (err, res, body) {
      request(uri).pipe(fs.createWriteStream(filename)).on('close', resolve);
    });
  });
}

let main = async () => {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.goto('https://memeculture69.tumblr.com/');
  await page.waitFor(1000);
  const imageUrl = await page.evaluate(
    // here we got the image url from the selector.
    () => document.querySelector('img.image')
  );
  // Now just simply pass the image url
  // to the downloader function to download  the image.
  await download(imageUrl, 'image.png');
};

main();
like image 9
Naimur Rahman Avatar answered Oct 16 '22 15:10

Naimur Rahman