Following Introduction to Computer Science track at Udacity, I'm trying to make a python script to extract links from page, below is the code I used:
I got the following error
NameError: name 'page' is not defined
Here is the code:
def get_page(page):
try:
import urllib
return urllib.urlopen(url).read()
except:
return ''
start_link = page.find('<a href=')
start_quote = page.find('"', start_link)
end_quote = page.find('"', start_quote + 1)
url = page[start_quote + 1:end_quote]
def get_next_target(page):
start_link = page.find('<a href=')
if start_link == -1:
return (None, 0)
start_quote = page.find('"', start_link)
end_quote = page.find('"', start_quote + 1)
url = page[start_quote + 1:end_quote]
return (url, end_quote)
(url, end_pos) = get_next_target(page)
page = page[end_pos:]
def print_all_links(page):
while True:
(url, end_pos) = get_next_target(page)
if url:
print(url)
page = page[:end_pos]
else:
break
print_all_links(get_page("http://xkcd.com/"))
page
is undefined and that is the cause of error.
For web scraping like this, you can simply use beautifulSoup
:
from bs4 import BeautifulSoup, SoupStrainer
import requests
url = "http://stackoverflow.com/"
page = requests.get(url)
data = page.text
soup = BeautifulSoup(data)
for link in soup.find_all('a'):
print(link.get('href'))
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With