I am trying to organize a pool with a maximum of 10 concurrent downloads. The function should download the base URL, then parse all URLs on that page and download each of them, but the OVERALL number of concurrent downloads should not exceed 10.
from lxml import etree
import gevent
from gevent import monkey, pool
import requests
monkey.patch_all()
urls = [
    'http://www.google.com',
    'http://www.yandex.ru',
    'http://www.python.org',
    'http://stackoverflow.com',
    # ... another 100 urls
]
LINKS_ON_PAGE=[]
POOL = pool.Pool(10)
def parse_urls(page):
    html = etree.HTML(page)
    if html:
        links = [link for link in html.xpath("//a/@href") if 'http' in link]
        # Download each url that appears in the main URL
        for link in links:
            data = requests.get(link)
            LINKS_ON_PAGE.append('%s: %s bytes: %r' % (link, len(data.content), data.status_code))

def get_base_urls(url):
    # Download the main URL
    data = requests.get(url)
    parse_urls(data.content)
How can I organize this so the downloads run concurrently, while keeping one global Pool limit for ALL web requests?
I think the following should get you what you want. I'm using BeautifulSoup in my example instead of the link-scraping code you had.
from bs4 import BeautifulSoup
import requests
import gevent
from gevent import monkey, pool
monkey.patch_all()
jobs = []
links = []
p = pool.Pool(10)
urls = [
    'http://www.google.com',
    # ... another 100 urls
]
def get_links(url):
    r = requests.get(url)
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'html.parser')
        links.extend(soup.find_all('a'))

for url in urls:
    jobs.append(p.spawn(get_links, url))

gevent.joinall(jobs)
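The snippet above only downloads the top-level URLs. One way to also fetch every discovered link through the same pool, so that the global limit of 10 covers ALL requests as the question asks, is to make two passes over the same Pool: first collect the links from the base pages, then spawn one download job per collected link. The sketch below assumes that approach; fetch_link, collected_links, and results are hypothetical names, not part of the original answer.

from bs4 import BeautifulSoup
import gevent
from gevent import monkey, pool
monkey.patch_all()

import requests

p = pool.Pool(10)        # one global pool caps ALL concurrent requests at 10
collected_links = []     # hypothetical: hrefs found on the base pages
results = []             # hypothetical: per-link download summaries

urls = [
    'http://www.google.com',
    # ... another 100 urls
]

def get_links(url):
    # Pass 1: download a base URL and record its outgoing links.
    r = requests.get(url)
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'html.parser')
        collected_links.extend(
            a['href'] for a in soup.find_all('a', href=True)
            if a['href'].startswith('http')
        )

def fetch_link(link):
    # Pass 2: download one discovered link.
    r = requests.get(link)
    results.append('%s: %s bytes: %r' % (link, len(r.content), r.status_code))

# Both passes go through the same 10-slot pool, so no more than 10
# requests are ever in flight at once.
gevent.joinall([p.spawn(get_links, url) for url in urls])
gevent.joinall([p.spawn(fetch_link, link) for link in collected_links])

Spawning the per-link jobs from inside get_links itself could deadlock: if all 10 slots were held by get_links greenlets each blocked on p.spawn, no slot would ever free up, which is why the two passes are kept separate here.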