I'm trying to create examples of how to manipulate massive databases composed of CSV tables using only Python.
I'd like to find a way to emulate efficient indexed queries on tables held in plain Python lists.
The example below takes 24 seconds on a 3.2 GHz Core i5:
#!/usr/bin/env python
import csv

MAINDIR = "../"
pf = open(MAINDIR + "atp_players.csv")
players = [p for p in csv.reader(pf)]
rf = open(MAINDIR + "atp_rankings_current.csv")
rankings = [r for r in csv.reader(rf)]

for i in rankings[:10]:
    player = filter(lambda x: x[0] == i[2], players)[0]
    print "%s(%s),(%s) Points: %s" % (player[2], player[5], player[3], i[3])
The data comes from this dataset.
A more efficient or more Pythonic way would be greatly appreciated.
You can use itertools.islice instead of reading all the rows, and itertools.ifilter to filter lazily:
import csv
from itertools import islice, ifilter

MAINDIR = "../"
with open(MAINDIR + "atp_players.csv") as pf, open(MAINDIR + "atp_rankings_current.csv") as rf:
    players = list(csv.reader(pf))
    rankings = csv.reader(rf)
    # only get the first ten rows using islice
    for i in islice(rankings, None, 10):
        # ifilter won't create a list, it yields values on the fly
        player = next(ifilter(lambda x: x[0] == i[2], players), "")
Not quite sure what filter(lambda x: x[0] == i[2], players)[0]
is meant to do; you seem to be searching the whole players list for every ranking row and keeping only the first match. It would pay either to sort the list once on the first element and use bisection search (a sketch of that follows the dict example below), or to build a dict with the first element as the key and the row as the value and then simply do lookups:
import csv
from itertools import islice
from collections import OrderedDict

MAINDIR = "../"
with open(MAINDIR + "atp_players.csv") as pf, open(MAINDIR + "atp_rankings_current.csv") as rf:
    players = OrderedDict((row[0], row) for row in csv.reader(pf))
    rankings = csv.reader(rf)
    for i in islice(rankings, None, 10):
        # now constant-time work to get each row, as opposed to O(n)
        player = players.get(i[2])
Which default value to use, or indeed whether one is needed at all, is up to you.
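For comparison, the bisection approach mentioned above could look roughly like this (only a sketch, assuming the ids in column 0 compare consistently as strings; find_player is an illustrative name, not part of the original code):

import csv
from bisect import bisect_left

MAINDIR = "../"
with open(MAINDIR + "atp_players.csv") as pf:
    # one-off O(n log n) sort on the player id (column 0)
    players_sorted = sorted(csv.reader(pf), key=lambda row: row[0])
keys = [row[0] for row in players_sorted]   # parallel key list for bisect

def find_player(player_id):
    # O(log n) binary search per lookup
    pos = bisect_left(keys, player_id)
    if pos < len(keys) and keys[pos] == player_id:
        return players_sorted[pos]
    return None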
If the key in the first column can repeat across rows but you only want to keep the first occurrence:
with open(MAINDIR + "atp_players.csv") as pf, open(MAINDIR + "atp_rankings_current.csv") as rf:
    players = {}
    for row in csv.reader(pf):
        key = row[0]
        if key in players:
            continue  # keep only the first row seen for each key
        players[key] = row
    rankings = csv.reader(rf)
    for i in islice(rankings, None, 10):
        player = players.get(i[2])
        print "%s(%s),(%s) Points: %s" % (player[2], player[5], player[3], i[3])
Output:
Djokovic(SRB),(R) Points: 11360
Federer(SUI),(R) Points: 9625
Nadal(ESP),(L) Points: 6585
Wawrinka(SUI),(R) Points: 5120
Nishikori(JPN),(R) Points: 5025
Murray(GBR),(R) Points: 4675
Berdych(CZE),(R) Points: 4600
Raonic(CAN),(R) Points: 4440
Cilic(CRO),(R) Points: 4150
Ferrer(ESP),(R) Points: 4045
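If you prefer, dict.setdefault does the same first-occurrence bookkeeping in one call (equivalent to the if key in players check above):

for row in csv.reader(pf):
    players.setdefault(row[0], row)  # inserts only if the key is not already present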
Timing the code on ten players shows ifilter to be the fastest, but we will see the dict win as we increase the number of rankings, and just how badly your code scales:
In [33]: %%timeit
MAINDIR = "tennis_atp-master/"
pf = open("/tennis_atp-master/atp_players.csv")
players = [p for p in csv.reader(pf)]
rf = open("/tennis_atp-master/atp_rankings_current.csv")
rankings = [r for r in csv.reader(rf)]
for i in rankings[:10]:
    player = filter(lambda x: x[0] == i[2], players)[0]
   ....:
10 loops, best of 3: 123 ms per loop
In [34]: %%timeit
with open("/tennis_atp-master/atp_players.csv") as pf, open("/tennis_atp-master/atp_rankings_current.csv") as rf:
    players = list(csv.reader(pf))
    rankings = csv.reader(rf)
    # only get the first ten rows using islice
    for i in islice(rankings, None, 10):
        # ifilter won't create a list, it yields values on the fly
        player = next(ifilter(lambda x: x[0] == i[2], players), "")
   ....:
10 loops, best of 3: 43.6 ms per loop
In [35]: %%timeit
with open("/tennis_atp-master/atp_players.csv") as pf, open("/tennis_atp-master/atp_rankings_current.csv") as rf:
    players = {}
    for row in csv.reader(pf):
        key = row[0]
        if key in players:
            continue
        players[key] = row
    rankings = csv.reader(rf)
    for i in islice(rankings, None, 10):
        player = players.get(i[2])
        pass
   ....:
10 loops, best of 3: 50.7 ms per loop
Now with 100 players you will see the dict is as fast as it was for 10; the cost of building the dict has been offset by the constant-time lookups:
In [38]: %%timeit
with open("/tennis_atp-master/atp_players.csv") as pf, open("/tennis_atp-master/atp_rankings_current.csv") as rf:
    players = list(csv.reader(pf))
    rankings = csv.reader(rf)
    # only get the first hundred rows using islice
    for i in islice(rankings, None, 100):
        # ifilter won't create a list, it yields values on the fly
        player = next(ifilter(lambda x: x[0] == i[2], players), "")
   ....:
10 loops, best of 3: 120 ms per loop
In [39]: %%timeit
with open("/tennis_atp-master/atp_players.csv") as pf, open("/tennis_atp-master/atp_rankings_current.csv") as rf:
    players = {}
    for row in csv.reader(pf):
        key = row[0]
        if key in players:
            continue
        players[key] = row
    rankings = csv.reader(rf)
    for i in islice(rankings, None, 100):
        player = players.get(i[2])
        pass
   ....:
10 loops, best of 3: 50.7 ms per loop
In [40]: %%timeit
MAINDIR = "tennis_atp-master/"
pf = open("/tennis_atp-master/atp_players.csv")
players = [p for p in csv.reader(pf)]
rf = open("/tennis_atp-master/atp_rankings_current.csv")
rankings = [r for r in csv.reader(rf)]
for i in rankings[:100]:
    player = filter(lambda x: x[0] == i[2], players)[0]
   ....:
1 loops, best of 3: 806 ms per loop
For 250 players:
# your code
1 loops, best of 3: 1.86 s per loop
# dict
10 loops, best of 3: 50.7 ms per loop
# ifilter
10 loops, best of 3: 483 ms per loop
The final test looping over the whole rankings:
# your code
1 loops, best of 3: 2min 40s per loop
# dict
10 loops, best of 3: 67 ms per loop
# ifilter
1 loops, best of 3: 1min 3s per loop
So you can see that, as we loop over more rankings, the dict option is by far the most efficient in terms of runtime and will scale extremely well.
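To come back to the "indexed queries" idea from the question, the same dict trick can be wrapped in a small helper that builds an index over any column (a sketch only; build_index and the column numbers are illustrative, not an existing API):

import csv
from collections import defaultdict
from itertools import islice

MAINDIR = "../"

def build_index(path, column):
    # map each value of `column` to the list of rows carrying it: a poor man's index
    index = defaultdict(list)
    with open(path) as fh:
        for row in csv.reader(fh):
            index[row[column]].append(row)
    return index

# build the index once, then every lookup is a constant-time dict access
players_by_id = build_index(MAINDIR + "atp_players.csv", 0)

with open(MAINDIR + "atp_rankings_current.csv") as rf:
    for i in islice(csv.reader(rf), None, 10):
        for player in players_by_id.get(i[2], []):
            print "%s(%s),(%s) Points: %s" % (player[2], player[5], player[3], i[3])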