Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Python async/await downloading a list of urls

I'm trying to download over 30,000 files from a FTP server, and after some googling using asynchronous IO seemed a good idea. However, the code below failed to download any files and returns a Timeout Error. I'd really appreciate any help! Thanks!

class pdb:
    def __init__(self):
        self.ids = []
        self.dl_id = []
        self.err_id = []


    async def download_file(self, session, url):
        try:
            with async_timeout.timeout(10):
                async with session.get(url) as remotefile:
                    if remotefile.status == 200:
                        data = await remotefile.read()
                        return {"error": "", "data": data}
                    else:
                        return {"error": remotefile.status, "data": ""}
        except Exception as e:
            return {"error": e, "data": ""}

    async def unzip(self, session, work_queue):
        while not work_queue.empty():
            queue_url = await work_queue.get()
            print(queue_url)
            data = await self.download_file(session, queue_url)
            id = queue_url[-11:-7]
            ID = id.upper()
            if not data["error"]:
                saved_pdb = os.path.join("./pdb", ID, f'{ID}.pdb')
                if ID not in self.dl_id:
                    self.dl_id.append(ID)
                with open(f"{id}.ent.gz", 'wb') as f:
                    f.write(data["data"].read())
                with gzip.open(f"{id}.ent.gz", "rb") as inFile, open(saved_pdb, "wb") as outFile:
                    shutil.copyfileobj(inFile, outFile)
                os.remove(f"{id}.ent.gz")
            else:
                self.err_id.append(ID)

    def download_queue(self, urls):
        loop = asyncio.get_event_loop()
        q = asyncio.Queue(loop=loop)
        [q.put_nowait(url) for url in urls]
        con = aiohttp.TCPConnector(limit=10)
        with aiohttp.ClientSession(loop=loop, connector=con) as session:
            tasks = [asyncio.ensure_future(self.unzip(session, q)) for _ in range(len(urls))]
            loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()

Error message if I remove the try part:

Traceback (most recent call last):
File "test.py", line 111, in
x.download_queue(urls)
File "test.py", line 99, in download_queue
loop.run_until_complete(asyncio.gather(*tasks))
File "/home/yz/miniconda3/lib/python3.6/asyncio/base_events.py", line 467, in run_until_complete
return future.result()
File "test.py", line 73, in unzip
data = await self.download_file(session, queue_url)
File "test.py", line 65, in download_file
return {"error": remotefile.status, "data": ""}
File "/home/yz/miniconda3/lib/python3.6/site- packages/async_timeout/init.py", line 46, in exit
raise asyncio.TimeoutError from None
concurrent.futures._base.TimeoutError

like image 809
Yi Zhou Avatar asked Jan 14 '18 22:01

Yi Zhou


2 Answers

tasks = [asyncio.ensure_future(self.unzip(session, q)) for _ in range(len(urls))]
loop.run_until_complete(asyncio.gather(*tasks))

Here you start process of downloading concurrently for all of your urls. It means that you start to count timeout for all of them also. Once it's a big number such as 30,000 it can't be physically done within 10 seconds due to networks/ram/cpu capacity.

To avoid this situation you should guarantee limit of coroutines started simultaneously. Usually synchronization primitives like asyncio.Semaphore can be used to achieve this.

It'll look like this:

sem = asyncio.Semaphore(10)

# ...

async def download_file(self, session, url):
    try:
        async with sem:  # Don't start next download until 10 other currently running
            with async_timeout.timeout(10):
like image 120
Mikhail Gerasimov Avatar answered Nov 11 '22 16:11

Mikhail Gerasimov


As an alternative to @MikhailGerasimov's semaphore approach, you might consider using the aiostream.stream.map operator:

from aiostream import stream, pipe

async def main(urls):
    async with aiohttp.ClientSession() as session:
        ws = stream.repeat(session)
        xs = stream.zip(ws, stream.iterate(urls))
        ys = stream.starmap(xs, fetch, ordered=False, task_limit=10)
        zs = stream.map(ys, process)
        await zs

Here's an equivalent implementation using pipes:

async def main3(urls):
    async with aiohttp.ClientSession() as session:
        await (stream.repeat(session)
               | pipe.zip(stream.iterate(urls))
               | pipe.starmap(fetch, ordered=False, task_limit=10)
               | pipe.map(process))

You can test it with the following coroutines:

async def fetch(session, url):
    await asyncio.sleep(random.random())
    return url

async def process(data):
    print(data)

See more aiostream examples in this demonstration and the documentation.

Disclaimer: I am the project maintainer.

like image 5
Vincent Avatar answered Nov 11 '22 15:11

Vincent