I am an crawling beginner using Puppeteer. I succeeded in crawling the below site. Below is a code for extracting a specific product name from the shopping mall.
const express = require('express');
const puppeteer = require('puppeteer');
const app = express();
(async () => {
const width = 1600, height = 1040;
const option = { headless: true, slowMo: true, args: [`--window-size=${width},${height}`] };
const browser = await puppeteer.launch(option);
const page = await browser.newPage();
const vp = {width: width, height: height};
await page.setViewport(vp);
const navigationPromise = page.waitForNavigation();
// 네이버 스토어팜
await page.goto('https://shopping.naver.com/home/p/index.nhn');
await navigationPromise;
await page.waitFor(2000);
const textBoxId = 'co_srh_input';
await page.type('.' + textBoxId, '양말', {delay: 100});
await page.keyboard.press('Enter');
await page.waitFor(5000);
await page.waitForSelector('ul.goods_list');
await page.addScriptTag({url: 'https://code.jquery.com/jquery-3.2.1.min.js'});
const result = await page.evaluate(() => {
const data = [];
$('ul.goods_list > li._itemSection').each(function () {
const title = $.trim($(this).find('div.info > a.tit').text());
const price = $(this).find('div.info > .price .num').text();
const image = $(this).find('div.img_area img').attr('src');
data.push({ title, price, image })
});
return data;
});
console.log(result);
await browser.close();
})();
app.listen(3000, () => console.log("Express!!!"));
I have a question. If I want to get information from number of pages, What should I do? for example ( 1 page, 2 page , 3page .... )
use attribute footerTemplate with displayHeaderFooter for show pages originally using puppeteer API
await page.pdf({
path: 'hacks.pdf',
format: 'A4',
displayHeaderFooter: true,
footerTemplate: '<div><div class='pageNumber'></div> <div>/</div><div class='totalPages'></div></div>'
});
https://github.com/puppeteer/puppeteer/blob/master/docs/api.md#pagepdfoptions
// footerTemplate HTML template for the print footer.
// Should be valid HTML markup with following CSS classes used to inject printing values into them:
// - date formatted print date
// - title document title
// - url document location
// - pageNumber current page number
// - totalPages total pages in the document
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With