This is my code. It contains no recursion, but it hits maximum recursion depth on the first pickle...
Code:
#!/usr/bin/env python
from bs4 import BeautifulSoup
from urllib2 import urlopen
import pickle

# open page and return soup list
def get_page_startups(page_url):
    html = urlopen(page_url).read()
    soup = BeautifulSoup(html, "lxml")
    return soup.find_all("div", "startup item")

#
# Get certain text from startup soup
#
def get_name(startup):
    return startup.find("a", "profile").string

def get_website(startup):
    return startup.find("a", "visit")["href"]

def get_status(startup):
    return startup.find("p", "status").strong.string[8:]

def get_twitter(startup):
    return startup.find("a", "comment").string

def get_high_concept_pitch(startup):
    return startup.find("div", "headline").find_all("em")[1].string

def get_elevator_pitch(startup):
    startup_soup = BeautifulSoup(urlopen("http://startupli.st" + startup.find("a", "profile")["href"]).read(), "lxml")
    return startup_soup.find("p", "desc").string.rstrip().lstrip()

def get_tags(startup):
    return startup.find("p", "tags").string

def get_blog(startup):
    try:
        return startup.find("a", "visit blog")["href"]
    except TypeError:
        return None

def get_facebook(startup):
    try:
        return startup.find("a", "visit facebook")["href"]
    except TypeError:
        return None

def get_angellist(startup):
    try:
        return startup.find("a", "visit angellist")["href"]
    except TypeError:
        return None

def get_linkedin(startup):
    try:
        return startup.find("a", "visit linkedin")["href"]
    except TypeError:
        return None

def get_crunchbase(startup):
    try:
        return startup.find("a", "visit crunchbase")["href"]
    except TypeError:
        return None

# site to scrape
BASE_URL = "http://startupli.st/startups/latest/"

# scrape all pages
for page_no in xrange(1, 142):
    startups = get_page_startups(BASE_URL + str(page_no))
    # search soup and pickle data
    for i, startup in enumerate(startups):
        s = {}
        s['name'] = get_name(startup)
        s['website'] = get_website(startup)
        s['status'] = get_status(startup)
        s['high_concept_pitch'] = get_high_concept_pitch(startup)
        s['elevator_pitch'] = get_elevator_pitch(startup)
        s['tags'] = get_tags(startup)
        s['twitter'] = get_twitter(startup)
        s['facebook'] = get_facebook(startup)
        s['blog'] = get_blog(startup)
        s['angellist'] = get_angellist(startup)
        s['linkedin'] = get_linkedin(startup)
        s['crunchbase'] = get_crunchbase(startup)
        f = open(str(i) + ".pkl", "wb")
        pickle.dump(s, f)
        f.close()
    print "Done " + str(page_no)
This is the content of 0.pkl after the exception is raised: http://pastebin.com/DVS1GKzz (thousands of lines long!)
There's some HTML from the BASE_URL in the pickle... but I didn't pickle any HTML strings...
BeautifulSoup .string attributes aren't actually strings:
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup('<div>Foo</div>')
>>> soup.find('div').string
u'Foo'
>>> type(soup.find('div').string)
<class 'bs4.element.NavigableString'>
A NavigableString keeps references back into the parse tree, so pickling one drags the entire soup along with it; that's why HTML from the page ends up in your pickle. Try using str(soup.find('div').string) instead and see if it helps. Also, I don't think pickle is really the best format here; JSON is much easier in this case.
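For example, a minimal sketch (assuming the s dict and loop index i from the question, and using unicode() rather than str() so non-ASCII text doesn't raise UnicodeEncodeError) that coerces every value to a plain string and writes JSON instead:
import json

def to_text(value):
    # unicode() detaches a NavigableString from the parse tree;
    # the link fields may already be None, so pass that through
    return unicode(value) if value is not None else None

s = dict((k, to_text(v)) for k, v in s.iteritems())
with open(str(i) + ".json", "w") as f:
    json.dump(s, f)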
Most likely pickle is recursing internally, and the structure you are trying to serialize is too deeply nested. You could try increasing the limit on the number of recursions allowed:
import sys
sys.setrecursionlimit(10000)
This is not recommended for any production-ready application, as it may mask the actual issue, but it can help highlight the problem during debugging.
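If you do raise the limit while debugging, a small sketch (reusing the question's pickle.dump(s, f) call) that restores the old limit afterwards so the change doesn't leak into the rest of the program:
import sys

old_limit = sys.getrecursionlimit()
sys.setrecursionlimit(10000)
try:
    pickle.dump(s, f)
finally:
    # always restore the previous limit, even if the dump still fails
    sys.setrecursionlimit(old_limit)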
Pickle cannot handle BeautifulSoup nodes. Similar questions exist with some workarounds; they generally amount to copying the data you need into builtin types before serializing.
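A short sketch of that idea applied to the question's dict (the same unicode() coercion as above, but keeping pickle as the format):
clean = dict((k, unicode(v) if v is not None else None)
             for k, v in s.iteritems())
with open(str(i) + ".pkl", "wb") as f:
    # clean contains only unicode and None, so pickle no longer recurses
    pickle.dump(clean, f)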