Python large multi-list efficient query

I'm trying to create examples of how to manipulate large databases composed of CSV tables using only Python.

I'd like to find a way to emulate efficient indexed queries on tables stored as plain Python lists.

The example below takes 24 seconds on a 3.2 GHz Core i5:

#!/usr/bin/env python
import csv
MAINDIR = "../"
pf = open(MAINDIR + "atp_players.csv")
players = [p for p in csv.reader(pf)]
rf = open(MAINDIR + "atp_rankings_current.csv")
rankings = [r for r in csv.reader(rf)]
for i in rankings[:10]:
    player = filter(lambda x: x[0]==i[2], players)[0]
    print "%s(%s),(%s) Points: %s" % (player[2], player[5], player[3], i[3])

The data comes from the tennis_atp dataset.

A more efficient, or more Pythonic, way would be greatly appreciated.

asked Mar 17 '23 by ppaulojr

1 Answer

You can use itertools.islice instead of reading all the rows into a list, and itertools.ifilter to search lazily:

import csv
from itertools import islice, ifilter

MAINDIR = "../"
with open(MAINDIR + "atp_players.csv") as pf, open(MAINDIR + "atp_rankings_current.csv") as rf:
    players = list(csv.reader(pf))
    rankings = csv.reader(rf)
    # only take the first ten rows using islice
    for i in islice(rankings, None, 10):
        # ifilter doesn't create a list; it yields matches lazily
        player = next(ifilter(lambda x: x[0] == i[2], players), "")

Not quite sure what filter(lambda x: x[0]==i[2],players)[0] is doing; you seem to be searching the whole players list each time and keeping just the first match. It might pay to sort the list once on the first element and use binary search, or to build a dict with the first element as the key and the row as the value, then simply do lookups.
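For reference, here is a minimal sketch of the binary-search alternative (assuming the first column is the lookup key; the keys list and variable names are illustrative):

import csv
from bisect import bisect_left
from itertools import islice

MAINDIR = "../"
with open(MAINDIR + "atp_players.csv") as pf, open(MAINDIR + "atp_rankings_current.csv") as rf:
    # sort once by player id (first column) so lookups can use binary search
    players = sorted(csv.reader(pf), key=lambda row: row[0])
    keys = [row[0] for row in players]  # parallel list of keys for bisect
    rankings = csv.reader(rf)
    for i in islice(rankings, None, 10):
        pos = bisect_left(keys, i[2])  # O(log n) instead of an O(n) scan
        player = players[pos] if pos < len(keys) and keys[pos] == i[2] else None

The dict approach is simpler still: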

import csv
from itertools import islice
from collections import OrderedDict

MAINDIR = "../"
with open(MAINDIR + "atp_players.csv") as pf, open(MAINDIR + "atp_rankings_current.csv") as rf:
    players = OrderedDict((row[0], row) for row in csv.reader(pf))
    rankings = csv.reader(rf)
    for i in islice(rankings, None, 10):
        # now constant-time work to get a row, as opposed to O(n)
        player = players.get(i[2])

You will have to decide what default value to use, and indeed whether one is needed at all.
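For example, a hypothetical placeholder row could be passed as the second argument to .get (the shape of the placeholder is illustrative, not taken from the dataset):

# hypothetical placeholder returned when a ranking references an unknown player id
MISSING = ["", "", "Unknown", "", "", ""]
player = players.get(i[2], MISSING)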

If the first-column keys can repeat across rows and you just want to keep the first occurrence:

with open(MAINDIR + "atp_players.csv") as pf, open(MAINDIR + "atp_rankings_current.csv") as rf:
    players = {}
    for row in csv.reader(pf):
        key = row[0]
        if key in players:
            # keep only the first occurrence of each key
            continue
        players[key] = row
    rankings = csv.reader(rf)
    for i in islice(rankings, None, 10):
        player = players.get(i[2])
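Adding the print from the original question inside any of these loops reproduces the expected result (Python 2 syntax, as in the question):

        print "%s(%s),(%s) Points: %s" % (player[2], player[5], player[3], i[3])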

Output:

Djokovic(SRB),(R) Points: 11360
Federer(SUI),(R) Points: 9625
Nadal(ESP),(L) Points: 6585
Wawrinka(SUI),(R) Points: 5120
Nishikori(JPN),(R) Points: 5025
Murray(GBR),(R) Points: 4675
Berdych(CZE),(R) Points: 4600
Raonic(CAN),(R) Points: 4440
Cilic(CRO),(R) Points: 4150
Ferrer(ESP),(R) Points: 4045

Timing the code on ten ranking rows shows ifilter to be the fastest, but as we increase the number of rankings we will see the dict win, and just how badly the original code scales:

In [33]: %%timeit
MAINDIR = "tennis_atp-master/"
pf = open("/tennis_atp-master/atp_players.csv")
players = [p for p in csv.reader(pf)]
rf = open("/tennis_atp-master/atp_rankings_current.csv")
rankings = [r for r in csv.reader(rf)]
for i in rankings[:10]:
    player = filter(lambda x: x[0]==i[2],players)[0]
   ....:
10 loops, best of 3: 123 ms per loop

In [34]: %%timeit
with open("/tennis_atp-master/atp_players.csv") as pf, open("/tennis_atp-master/atp_rankings_current.csv") as rf:
    players = list(csv.reader(pf))
    rankings = csv.reader(rf)
    # only take the first ten rows using islice
    for i in islice(rankings, None, 10):
        # ifilter doesn't create a list; it yields matches lazily
        player = next(ifilter(lambda x: x[0] == i[2], players), "")
   ....:
10 loops, best of 3: 43.6 ms per loop

In [35]: %%timeit
with open("/tennis_atp-master/atp_players.csv") as pf, open("/tennis_atp-master/atp_rankings_current.csv") as rf:
    players = {}
    for row in csv.reader(pf):
        key = row[0]
        if key in players:
            continue
        players[key] = row
    rankings = csv.reader(rf)
    for i in islice(rankings, None, 10):
        player = players.get(i[2])
        pass
   ....:
10 loops, best of 3: 50.7 ms per loop

Now with 100 players you can see the dict is as fast as it was for 10; the cost of building the dict has been offset by the constant-time lookups:

In [38]: %%timeit
with open("/tennis_atp-master/atp_players.csv") as pf, open("/tennis_atp-master/atp_rankings_current.csv") as rf:
    players = list(csv.reader(pf))
    rankings = csv.reader(rf)
    # only take the first 100 rows using islice
    for i in islice(rankings, None, 100):
        # ifilter doesn't create a list; it yields matches lazily
        player = next(ifilter(lambda x: x[0] == i[2], players), "")
   ....:
10 loops, best of 3: 120 ms per loop

In [39]: %%timeit
with open("/tennis_atp-master/atp_players.csv") as pf, open("/tennis_atp-master/atp_rankings_current.csv") as rf:
    players = {}
    for row in csv.reader(pf):
        key = row[0]
        if key in players:
            continue
        players[key] = row
    rankings = csv.reader(rf)
    for i in islice(rankings, None, 100):
        player = players.get(i[2])
        pass
   ....:
10 loops, best of 3: 50.7 ms per loop

In [40]: %%timeit
MAINDIR = "tennis_atp-master/"
pf = open("/tennis_atp-master/atp_players.csv")
players = [p for p in csv.reader(pf)]
rf = open("/tennis_atp-master/atp_rankings_current.csv")
rankings = [r for r in csv.reader(rf)]
for i in rankings[:100]:
    player = filter(lambda x: x[0]==i[2],players)[0]
   ....:
1 loops, best of 3: 806 ms per loop

For 250 players:

# your code
1 loops, best of 3: 1.86 s per loop

# dict
10 loops, best of 3: 50.7 ms per loop

# ifilter
10 loops, best of 3: 483 ms per loop

The final test loops over the whole rankings file:

# your code

1 loops, best of 3: 2min 40s per loop

# dict 
10 loops, best of 3: 67 ms per loop

# ifilter
1 loops, best of 3: 1min 3s per loop

So you can see that as we loop over more rankings, the dict option is by far the most efficient in terms of runtime, and it will scale extremely well.

answered Mar 28 '23 by Padraic Cunningham