Scrapy crawling through pages with PostBack data javascript url doesn't change

Question

I'm crawling through some directories with ASP.NET programming via Scrapy.

The pages to crawl through are encoded as such:

javascript:__doPostBack('MoreInfoListZbgs1$Pager','X')

where X is an int between 1 and 180. The problem is that the url remains the same when I clicked next page or any page. I've written down some codes below which can only extract each link within the first page.

# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
import re
from scrapy.http import FormRequest
import js2xml
import requests
from datetime import datetime


class nnggzySpider(scrapy.Spider):

    name = 'nnggzygov'
    start_urls = [
        'https://www.nnggzy.org.cn/gxnnzbw/showinfo/zbxxmore.aspx?categorynum=001004001'
    ]

    base_url = 'https://www.nnggzy.org.cn'


    custom_settings = {
        'LOG_LEVEL': 'ERROR'
    }

    def parse(self, response):
        _response = response.text
        self.data = {}
        soup = BeautifulSoup(response.body, 'html.parser')
        tags = soup.find_all('a', href=re.compile(r"InfoDetail"))

        # 获取翻页参数
        __VIEWSTATE = re.findall(r'id="__VIEWSTATE" value="(.*?)" />', _response)
        A = __VIEWSTATE[0]
        # print(A)
        __EVENTTARGET = 'MoreInfoListZbgs1$Pager'
        B = __EVENTTARGET
        __CSRFTOKEN = re.findall(r'id="__CSRFTOKEN" value="(.*?)" />', _response)
        C = __CSRFTOKEN
        page_num = re.findall(r'title="转到第(.*?)页"', _response)
        max_page = page_num[-1]

        content = {
            '__VIEWSTATE': A,
            '__EVENTTARGET': B,
            '__CSRFTOKEN': C,
            'page_num': max_page
        }
        infoid = re.findall(r'InfoID=(.*?)&CategoryNum', _response)
        print(infoid)
        yield scrapy.Request(url=response.url, callback=self.parse_detail, meta={"data": content})

    def parse_detail(self, response):
        max_page = response.meta['data']['page_num']
        for i in range(2, int(max_page)):
            data = {
                '__CSRFTOKEN': '{}'.format(response.meta['data']['__CSRFTOKEN']),
                '__VIEWSTATE': '{}'.format(response.meta['data']['__VIEWSTATE']),
                '__EVENTTARGET': 'MoreInfoListZbgs1$Pager',
                '__EVENTARGUMENT': '{}'.format(i),
                # '__VIEWSTATEENCRYPTED': '',
                # 'txtKey': ''
            }
            yield scrapy.FormRequest(url=response.url, callback=self.parse, formdata=data, method="POST", dont_filter=True)

Can anyone help me with this?

Michael Savchenko · Accepted Answer

Looks like the pagination over mentioned website is made by sending POST requests with formdata like:

{
    "__CSRFTOKEN": ...,
    "__VIEWSTATE": ...,
    "__EVENTTARGET": "MoreInfoListZbgs1$Pager",
    "__EVENTARGUMENT": page_number,
    "__VIEWSTATEENCRYPTED": "",
    "txtKey": ""
}

Scrapy crawling through pages with PostBack data javascript url doesn't change

Tags:

python

scrapy

web-crawler

Lance Liao

1 Answers

Michael Savchenko

Recent Activity

Donate For Us

Scrapy crawling through pages with PostBack data javascript url doesn't change

Tags:

python

scrapy

web-crawler

Lance Liao

1 Answers

Michael Savchenko

Related questions

Recent Activity

Donate For Us