How to obtain all the links in a domain using Python?

I want to use Python to obtain all the links in a domain, given a 'root' URL (in a list). Suppose I am given the URL http://www.example.com: this should return all the links on that page that are in the same domain as the root URL, then recurse on each of those links, visiting them and extracting all same-domain links, and so on. By "same domain" I mean that, given http://www.example.com, the only links I want back are http://www.example.com/something, http://www.example.com/somethingelse ... Anything external such as http://www.otherwebsite.com should be discarded. How can I do this using Python?

EDIT: I made an attempt using lxml. I don't think this works fully, and I am not sure how to take into account links to already-processed pages (which would cause an infinite loop).

import urllib
import lxml.html

#given a url returns list of all sublinks within the same domain
def getLinks(url):
    urlList = []
    urlList.append(url)
    sublinks = getSubLinks(url)
    for link in sublinks:
        absolute = url + '/' + link
        urlList.extend(getLinks(absolute))
    return urlList

#determine whether two links are within the same domain
def sameDomain(url, dom):
    return url.startswith(dom)

#get tree of sublinks in same domain, url is root
def getSubLinks(url):
    sublinks = []
    connection = urllib.urlopen(url)
    dom = lxml.html.fromstring(connection.read())
    for link in dom.xpath('//a/@href'):
        if not (link.startswith('#') or link.startswith('http') or link.startswith('mailto:')):
            sublinks.append(link)
    return sublinks
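
One common way to handle already-visited pages is to keep a set of URLs you have seen and to resolve relative links with urljoin before checking the domain. The sketch below is only a minimal illustration of that idea, not a complete solution; it assumes Python 3, reuses lxml as in the attempt above, and the names crawl and same_domain are made up for this example.

from urllib.parse import urljoin, urlparse
import urllib.request
import lxml.html

def same_domain(url, root):
    # Compare the host part of both URLs
    return urlparse(url).netloc == urlparse(root).netloc

def crawl(root):
    visited = set()
    queue = [root]
    while queue:
        url = queue.pop()
        if url in visited:
            continue  # this page was already processed
        visited.add(url)
        try:
            html = urllib.request.urlopen(url).read()
        except Exception:
            continue  # unreachable page, skip it
        dom = lxml.html.fromstring(html)
        for href in dom.xpath('//a/@href'):
            absolute = urljoin(url, href)  # resolves relative links against the current page
            if same_domain(absolute, root):
                queue.append(absolute)
    return visited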

1 Answer

import sys
import requests
import hashlib
from bs4 import BeautifulSoup
from datetime import datetime

def get_soup(link):
    """
    Return the BeautifulSoup object for input link
    """
    request_object = requests.get(link, auth=('user', 'pass'))
    soup = BeautifulSoup(request_object.content, "html.parser")
    return soup

def get_status_code(link):
    """
    Return the error code for any url
    param: link
    """
    try:
        error_code = requests.get(link).status_code
    except requests.exceptions.ConnectionError:
        error_code = -1  # sentinel value: the URL could not be reached
    return error_code

def find_internal_urls(lufthansa_url, depth=0, max_depth=2):
    all_urls_info = []
    soup = get_soup(lufthansa_url)
    a_tags = soup.findAll("a", href=True)

    if depth > max_depth:
        return []
    else:
        for a_tag in a_tags:
            # Resolve site-relative links against the domain root; keep absolute links as-is
            if "http" not in a_tag["href"] and "/" in a_tag["href"]:
                url = "http://www.lufthansa.com" + a_tag['href']
            elif "http" in a_tag["href"]:
                url = a_tag["href"]
            else:
                continue
            # Build a fresh dict for each link so the entries in the list stay independent
            status_dict = {}
            status_dict["url"] = url
            status_dict["status_code"] = get_status_code(url)
            status_dict["timestamp"] = datetime.now()
            status_dict["depth"] = depth + 1
            all_urls_info.append(status_dict)
    return all_urls_info


if __name__ == "__main__":
    depth = 2  # suppose
    all_page_urls = find_internal_urls("someurl", 2, 2)
    if depth > 1:
        for status_dict in all_page_urls:
            find_internal_urls(status_dict['url'])

The above snippet contains the necessary modules for scraping URLs from the Lufthansa airlines website. The only addition here is that you can specify the depth to which you want to scrape recursively.
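
If you recurse over the returned URLs as in the __main__ block above, it can also help to track which URLs have already been crawled so the same page is not fetched twice. Below is a minimal sketch of such a wrapper, assuming the find_internal_urls function above; the name crawl_recursively is made up for this example.

def crawl_recursively(start_url, max_depth=2, depth=0, seen=None):
    # Recursively collect link info, skipping URLs that have already been visited
    if seen is None:
        seen = set()
    if depth > max_depth or start_url in seen:
        return []
    seen.add(start_url)
    page_links = find_internal_urls(start_url, depth, max_depth)
    collected = list(page_links)
    for info in page_links:
        collected.extend(crawl_recursively(info["url"], max_depth, depth + 1, seen))
    return collected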
