I am using BeautifulSoup, how can I get the link after the redirect?

I want to get the final link that the download link on an article page redirects to.

For example: https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/

The article page above contains the following download link: https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/yz5cw79mbn3a/ECNHOgoNYk0MIkEoFlUkFlY5Vj5WVSRQACVKfx8EOw8ReVs+FFs=

If you open this link directly, it will not redirect to the real download link; you have to follow it from the article page. You can see this with a minimal sketch that requests the link directly with redirects disabled (the URL is the example download link from above; the exact response may vary):
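import requests

# a minimal sketch of the behaviour described above; the URL is the example
# download link from this page, and the exact response may vary
url = ('https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/'
       'yz5cw79mbn3a/ECNHOgoNYk0MIkEoFlUkFlY5Vj5WVSRQACVKfx8EOw8ReVs+FFs=')

# allow_redirects=False exposes the raw response instead of following it
r = requests.get(url, allow_redirects=False)
print(r.status_code)               # not a redirect to the real file host
print(r.headers.get('Location'))   # likely missing without a Referer header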

# coding=utf-8

import sys

import requests
from bs4 import BeautifulSoup  # the 'lxml' parser used below also requires lxml installed


def urlopen(url):
    '''
    Fetch a URL with requests (in place of urllib.request.urlopen)
    and return the HTML as text.
    '''
    headers = {"User-Agent": "Mozilla/5.0"}
    r = requests.get(url, headers=headers)
    return r.text

def generate_pages(subTitle, fromPage, toPage):
    '''
    Return a list of category page URLs.
    '''
    pages = []
    if 0 < fromPage <= toPage:  # allow a single-page range as well
        for i in range(fromPage, toPage + 1):
            pages.append('https://scanlibs.com/category/books' + subTitle + '/page/' + str(i))
    return pages



def get_book_sites_of_one_page(page):
    '''
    Get the book page URLs listed on one category page.
    input: category page URL
    output: list of book page URLs
    '''
    html = urlopen(page)
    soup = BeautifulSoup(html, 'html.parser')
    linkList = soup.find('main').findAll('a', {'rel': 'bookmark'})
    bookSites = []
    # each book is linked twice (thumbnail and title), so take every other link
    for link in linkList[::2]:
        if 'href' in link.attrs:
            bookSites.append(link.attrs['href'])
    return bookSites


def get_book_urls(bookSite):
    '''
    Given a book page URL, find the download links on that page
    and return them as a list.
    '''
    bookURLs = []
    html = urlopen(bookSite)
    soup = BeautifulSoup(html, 'lxml')
    linkList = soup.findAll("a", {"target": "_blank"})
    # the links appear in duplicate, so take every other one
    for link in linkList[::2]:
        if 'href' in link.attrs:
            bookURLs.append(link.attrs['href'])
    return bookURLs


def get_all_book_urls(fromPage=1, toPage=1, subTitle=''):
    bookSites = []
    bookURLs = []
    pages = generate_pages(subTitle, fromPage, toPage)

    for page in pages:
        bookSiteOfOnePage = get_book_sites_of_one_page(page)
        bookSites.extend(bookSiteOfOnePage)

    for bookSite in bookSites:
        book_urls = get_book_urls(bookSite)
        bookURLs += book_urls

    for bookURL in bookURLs:
        print(bookURL)

    #with open(filename, 'w') as f:
    #    f.write(bookURLs)


def main():
    if len(sys.argv) == 4:
        '''
        python getUrl.py 1 100 programming
        from page 1 to page 100 in the programming category
        '''
        subTitle = str(sys.argv[3])
        fromPage = int(sys.argv[1])
        toPage = int(sys.argv[2])
        get_all_book_urls(fromPage, toPage, subTitle)

    elif len(sys.argv) == 3:
        '''
        python getUrl.py 1 100
        from page 1 to page 100
        '''
        subTitle = ''
        fromPage = int(sys.argv[1])
        toPage = int(sys.argv[2])
        #filename = subTitle="-"+str(pageNum)+".txt"
        get_all_book_urls(fromPage, toPage, subTitle)

    elif len(sys.argv) == 2:
        '''
        python getUrl.py 10
        only scrape books on page 10
        '''
        fromPage = int(sys.argv[1])
        toPage = fromPage  # a single page
        subTitle = ''
        #filename = "All-"+str(pageNum)+".txt"
        get_all_book_urls(fromPage, toPage, subTitle)

    elif len(sys.argv) == 1:
        fromPage = 1
        # default page range
        toPage = 2
        subTitle = ''
        #filename = "All-"+"1"+"-"+time.strftime('%Y-%m-%d', time.localtime())+".txt"
        get_all_book_urls(fromPage, toPage, subTitle)
    else:
        print("Error, wrong number of arguments")



if __name__ == '__main__':
    #filename = ''
    main()

Thank you for your help!

1 Answer

This website checks whether the Referer header is set when redirecting. You can simply send the original article URL as the Referer header and bypass this check. You can also see that the referer is reused as a URL parameter in the final download link.

import requests
from bs4 import BeautifulSoup

s = requests.Session()
url = 'https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/'
html = s.get(url).text
soup = BeautifulSoup(html, 'html.parser')
relative_link = soup.find('a', {'id': 'download'})['href']  # get the relative link
download_redirect_link = url + relative_link
headers = {
    "referer": url  # the site only redirects when the referer is set
}
r2 = s.get(download_redirect_link, headers=headers)
print(r2.url)  # the final URL after following the redirect

Output

https://rapidgator.net/file/80e881f7631eddb49de31e5718eb96ba?referer=https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/
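
If you want to confirm that the Referer header is what triggers the redirect, requests records the intermediate responses in r2.history, so you can print the whole chain:

# print the redirect chain recorded by requests for the request above
for resp in r2.history:
    print(resp.status_code, resp.url)   # each intermediate hop
print(r2.status_code, r2.url)           # the final response, e.g. the rapidgator URL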