Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Scrapy crawling through pages with PostBack data javascript url doesn't change

I'm crawling through some directories with ASP.NET programming via Scrapy.

The pages to crawl through are encoded as such:

javascript:__doPostBack('MoreInfoListZbgs1$Pager','X')

where X is an int between 1 and 180. The problem is that the url remains the same when I clicked next page or any page. I've written down some codes below which can only extract each link within the first page.

# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
import re
from scrapy.http import FormRequest
import js2xml
import requests
from datetime import datetime


class nnggzySpider(scrapy.Spider):

    name = 'nnggzygov'
    start_urls = [
        'https://www.nnggzy.org.cn/gxnnzbw/showinfo/zbxxmore.aspx?categorynum=001004001'
    ]

    base_url = 'https://www.nnggzy.org.cn'


    custom_settings = {
        'LOG_LEVEL': 'ERROR'
    }

    def parse(self, response):
        _response = response.text
        self.data = {}
        soup = BeautifulSoup(response.body, 'html.parser')
        tags = soup.find_all('a', href=re.compile(r"InfoDetail"))

        # 获取翻页参数
        __VIEWSTATE = re.findall(r'id="__VIEWSTATE" value="(.*?)" />', _response)
        A = __VIEWSTATE[0]
        # print(A)
        __EVENTTARGET = 'MoreInfoListZbgs1$Pager'
        B = __EVENTTARGET
        __CSRFTOKEN = re.findall(r'id="__CSRFTOKEN" value="(.*?)" />', _response)
        C = __CSRFTOKEN
        page_num = re.findall(r'title="转到第(.*?)页"', _response)
        max_page = page_num[-1]

        content = {
            '__VIEWSTATE': A,
            '__EVENTTARGET': B,
            '__CSRFTOKEN': C,
            'page_num': max_page
        }
        infoid = re.findall(r'InfoID=(.*?)&CategoryNum', _response)
        print(infoid)
        yield scrapy.Request(url=response.url, callback=self.parse_detail, meta={"data": content})

    def parse_detail(self, response):
        max_page = response.meta['data']['page_num']
        for i in range(2, int(max_page)):
            data = {
                '__CSRFTOKEN': '{}'.format(response.meta['data']['__CSRFTOKEN']),
                '__VIEWSTATE': '{}'.format(response.meta['data']['__VIEWSTATE']),
                '__EVENTTARGET': 'MoreInfoListZbgs1$Pager',
                '__EVENTARGUMENT': '{}'.format(i),
                # '__VIEWSTATEENCRYPTED': '',
                # 'txtKey': ''
            }
            yield scrapy.FormRequest(url=response.url, callback=self.parse, formdata=data, method="POST", dont_filter=True)

Can anyone help me with this?

like image 426
Lance Liao Avatar asked Mar 26 '26 09:03

Lance Liao


1 Answers

Looks like the pagination over mentioned website is made by sending POST requests with formdata like:

{
    "__CSRFTOKEN": ...,
    "__VIEWSTATE": ...,
    "__EVENTTARGET": "MoreInfoListZbgs1$Pager",
    "__EVENTARGUMENT": page_number,
    "__VIEWSTATEENCRYPTED": "",
    "txtKey": ""
}
like image 196
Michael Savchenko Avatar answered Mar 28 '26 23:03

Michael Savchenko



Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!