Following Introduction to Computer Science track at Udacity, I'm trying to make a python script to extract links from page, below is the code I used:
I got the following error
NameError: name 'page' is not defined
Here is the code:
def get_page(page):
    try:
        import urllib
        return urllib.urlopen(url).read()
    except:
        return ''
start_link = page.find('<a href=')
start_quote = page.find('"', start_link)
end_quote = page.find('"', start_quote + 1)
url = page[start_quote + 1:end_quote]
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return (None, 0)
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return (url, end_quote)
(url, end_pos) = get_next_target(page)
page = page[end_pos:]
def print_all_links(page):
    while True:
        (url, end_pos) = get_next_target(page)
        if url:
            print(url)
            page = page[:end_pos]
        else:
            break
print_all_links(get_page("http://xkcd.com/"))
                page is undefined and that is the cause of error.
For web scraping like this, you can simply use beautifulSoup:
from bs4 import BeautifulSoup, SoupStrainer
import requests
url = "http://stackoverflow.com/"
page = requests.get(url)    
data = page.text
soup = BeautifulSoup(data)
for link in soup.find_all('a'):
    print(link.get('href'))
                        If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With