
Processing huge CSV file using Python and multithreading

I have a function that yields lines from a huge CSV file lazily:

def get_next_line():
    with open(sample_csv,'r') as f:
        for line in f:
            yield line

def do_long_operation(row):
    print('Do some operation that takes a long time')

I need to use threads so that, for each record I get from the function above, I can call do_long_operation.

Most examples I find on the internet look like this, and I am not sure whether I am on the right path.

import threading
thread_list = []
for i in range(8):
    # (pseudo-code: how do I pass the next row from get_next_line here?)
    t = threading.Thread(target=do_long_operation, args=(get_next_row from get_next_line))
    thread_list.append(t)

for thread in thread_list:
    thread.start()

for thread in thread_list:
    thread.join()

My questions are:

  1. How do I start only a finite number of threads, say 8?

  2. How do I make sure that each of the threads will get a row from get_next_line?



2 Answers

You could use a thread pool from multiprocessing and map your tasks to a pool of workers:

from multiprocessing.pool import ThreadPool as Pool
# from multiprocessing import Pool
from random import randint
from time import sleep


def process_line(l):
    print(l, "started")
    sleep(randint(0, 3))
    print(l, "done")


def get_next_line():
    with open("sample.csv", "r") as f:
        for line in f:
            yield line


f = get_next_line()

t = Pool(processes=8)

for i in f:
    # apply_async returns immediately, so the eight workers
    # process the submitted lines in parallel
    t.apply_async(process_line, (i,))
t.close()
t.join()

This creates eight workers and submits your lines to them one by one; as soon as a worker is free, it is allocated a new task.

There is a commented-out import statement, too. If you comment out the ThreadPool import and import Pool from multiprocessing instead, you will get subprocesses instead of threads, which may be more efficient in your case.
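For reference, here is a minimal sketch of that process-based variant (the file name "sample.csv" is just a placeholder); the if __name__ == "__main__" guard matters because on platforms that spawn rather than fork, the worker processes re-import this module:

from multiprocessing import Pool
from random import randint
from time import sleep


def process_line(l):
    print(l, "started")
    sleep(randint(0, 3))
    print(l, "done")


def get_next_line():
    with open("sample.csv", "r") as f:  # placeholder file name
        for line in f:
            yield line


if __name__ == "__main__":
    # eight worker processes instead of threads
    with Pool(processes=8) as pool:
        # note: map() pulls the whole iterable into memory;
        # see the next answer for a memory-bounded approach
        pool.map(process_line, get_next_line())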



Use a Pool/ThreadPool from multiprocessing to map tasks to a pool of workers, and a Queue to control how many tasks are held in memory (so we don't read too far ahead into the huge CSV file when the workers are slow):

from multiprocessing.pool import ThreadPool as Pool
# from multiprocessing import Pool
from random import randint
import time, os
from multiprocessing import Queue


def process_line(l):
    print("{} started".format(l))
    time.sleep(randint(0, 3))
    print("{} done".format(l))


def get_next_line():
    with open(sample_csv, 'r') as f:
        for line in f:
            yield line

# use for testing
# def get_next_line():
#     for i in range(100):
#         print('yielding {}'.format(i))
#         yield i


def worker_main(queue):
    print("{} working".format(os.getpid()))
    while True:
        # Get item from queue, block until one is available
        item = queue.get(True)
        if item is None:
            # Shut down this worker and requeue the sentinel so other workers can shut down as well
            queue.put(None)
            break
        else:
            # Process item
            process_line(item)
    print("{} done working".format(os.getpid()))


f = get_next_line()

# Use a multiprocessing queue with maxsize
q = Queue(maxsize=5)

# Start workers to process queue items
t = Pool(processes=8, initializer=worker_main, initargs=(q,))

# Enqueue items. This blocks if the queue is full.
for l in f:
    q.put(l)

# Enqueue the shutdown message (i.e. None)
q.put(None)

# We need to first close the pool before joining
t.close()
t.join()
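Note that the pool's own task mechanism is never used here: each worker spends its whole life inside worker_main, pulling lines from the shared Queue until it sees the None sentinel, and close()/join() simply let the then-idle workers exit. The maxsize=5 on the Queue is what keeps memory bounded, because q.put() blocks whenever five unread lines are already waiting.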