Imagine I have a dictionary / hashtable of pairs of strings (keys) and their respective probabilities (values):
import numpy as np
import random
import uuid
# Creating the N vocabulary and M vocabulary
max_word_len = 20  # upper bound on the length of a generated word
n_vocab_size = random.randint(8000, 10000)
m_vocab_size = random.randint(8000, 10000)


def random_word():
    """Return a random word made of uppercase hexadecimal digits.

    The word is a prefix of a freshly generated UUID4's hex representation;
    its length is picked with ``random.randint(1, max_word_len)``.
    """
    # ``UUID.hex`` is available on both Python 2 and Python 3, whereas the
    # ``get_hex()`` method only exists on Python 2.
    word_len = random.randint(1, max_word_len)
    return uuid.uuid4().hex.upper()[:word_len]
# Build the two vocabularies out of random words.
n_vocab = [random_word() for _ in range(n_vocab_size)]
m_vocab = [random_word() for _ in range(m_vocab_size)]
# Make up a probability for every (n_vocab word, m_vocab word) pair.
hashes = dict(
    ((n_word, m_word), random.random())
    for n_word in n_vocab
    for m_word in m_vocab
)
The `hashes` hashtable will look something like this:
{('585F', 'B4867'): 0.7582038699473549,
('69', 'D98B23C5809A'): 0.7341569569849136,
('4D30CB2BF4134', '82ED5FA3A00E4728AC'): 0.9106077161619021,
('DD8F8AFA5CF', 'CB'): 0.4609114677237601,
...
}
Imagine that this is the input hashtable that I'll read from a CSV file, with the first and second columns being the word pairs (keys) of the hashtable and the third column the probabilities.
If I were to put the probabilities into some sort of `numpy` matrix, I would have to do this from the hashtable:
# Split the pair keys into the first-position and second-position words.
n_words, m_words = zip(*hashes)
# One row per word in m_vocab, one column per word in n_vocab.
probs = np.array([
    [hashes[n_word, m_word] for n_word in n_vocab]
    for m_word in m_vocab
])
Is there another way to get the probabilities into the |N| * |M| matrix from the hashtable without doing a nested loop through `m_vocab` and `n_vocab`?
(Note: I'm creating random words and random probabilities here but imagine I have read the hash table from a file and it's read into that hashtable structure)
Assume both scenarios, where:
`csv` file (@bunji's answer resolves this)
It is important that the final matrix be queryable; the following isn't desirable:
$ echo -e 'abc\txyz\t0.9\nefg\txyz\t0.3\nlmn\topq\t\0.23\nabc\tjkl\t0.5\n' > test.txt
$ cat test.txt
abc xyz 0.9
efg xyz 0.3
lmn opq .23
abc jkl 0.5
$ python
Python 2.7.10 (default, Jul 30 2016, 18:31:42)
[GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.34)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import pandas as pd
>>> pt = pd.read_csv('test.txt', index_col=[0,1], header=None, delimiter='\t').unstack().as_matrix()
>>> pt
array([[ 0.5, nan, 0.9],
[ nan, nan, 0.3],
[ nan, nan, nan]])
>>> pd.read_csv('test.txt', index_col=[0,1], header=None, delimiter='\t').unstack()
2
1 jkl opq xyz
0
abc 0.5 NaN 0.9
efg NaN NaN 0.3
lmn NaN NaN NaN
>>> df = pd.read_csv('test.txt', index_col=[0,1], header=None, delimiter='\t').unstack()
>>> df
2
1 jkl opq xyz
0
abc 0.5 NaN 0.9
efg NaN NaN 0.3
lmn NaN NaN NaN
>>> df['abc', 'jkl']
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
return self._getitem_multilevel(key)
File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
loc = self.columns.get_loc(key)
File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1617, in get_loc
return self._engine.get_loc(key)
File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
File "pandas/index.pyx", line 161, in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)
File "pandas/src/hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13161)
File "pandas/src/hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13115)
KeyError: ('abc', 'jkl')
>>> df['abc']['jkl']
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
return self._getitem_multilevel(key)
File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
loc = self.columns.get_loc(key)
File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1597, in get_loc
loc = self._get_level_indexer(key, level=0)
File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1859, in _get_level_indexer
loc = level_index.get_loc(key)
File "/Library/Python/2.7/site-packages/pandas/indexes/base.py", line 2106, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
File "pandas/index.pyx", line 163, in pandas.index.IndexEngine.get_loc (pandas/index.c:4090)
KeyError: 'abc'
>>> df[0][2]
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
return self._getitem_multilevel(key)
File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
loc = self.columns.get_loc(key)
File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1597, in get_loc
loc = self._get_level_indexer(key, level=0)
File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1859, in _get_level_indexer
loc = level_index.get_loc(key)
File "/Library/Python/2.7/site-packages/pandas/indexes/base.py", line 2106, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
File "pandas/index.pyx", line 161, in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)
File "pandas/src/hashtable_class_helper.pxi", line 404, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8141)
File "pandas/src/hashtable_class_helper.pxi", line 410, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8085)
KeyError: 0
>>> df[0]
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2055, in __getitem__
return self._getitem_multilevel(key)
File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 2099, in _getitem_multilevel
loc = self.columns.get_loc(key)
File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1597, in get_loc
loc = self._get_level_indexer(key, level=0)
File "/Library/Python/2.7/site-packages/pandas/indexes/multi.py", line 1859, in _get_level_indexer
loc = level_index.get_loc(key)
File "/Library/Python/2.7/site-packages/pandas/indexes/base.py", line 2106, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/index.pyx", line 139, in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)
File "pandas/index.pyx", line 161, in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)
File "pandas/src/hashtable_class_helper.pxi", line 404, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8141)
File "pandas/src/hashtable_class_helper.pxi", line 410, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8085)
KeyError: 0
The resulting matrix/dataframe should be queryable, i.e. it should be possible to do something like:
probs[('585F', 'B4867')] = 0.7582038699473549
I'm not sure if there is a way to completely avoid looping, but I imagine it could be optimized by using `itertools`:
import itertools

# Lazily enumerate every (n, m) pair; the n_vocab word varies slowest.
nested_loop_iter = itertools.product(n_vocab, m_vocab)
# Passing ``count`` lets np.fromiter pre-allocate the output array instead of
# growing it on demand while consuming the iterator.
probs = np.fromiter(
    map(hashes.get, nested_loop_iter),
    dtype=float,
    count=len(n_vocab) * len(m_vocab),
)
# Only the shape changes here (the element count stays the same), so use
# reshape() rather than the in-place resize().
# Note that because the pairs iterate over n_vocab first, we need to
# transpose at the end to get one row per m_vocab word.
probs = probs.reshape((len(n_vocab), len(m_vocab))).T
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With