This is my first ever post on Stack Overflow and I am a complete beginner at coding, so please bear with me.
I am working on an experiment which has two sets of data documents. Doc1 is as follows:
TOPIC:topic_0 5892.0
site 0.0371690427699
Internet 0.0261371350984
online 0.0229124236253
web 0.0218940936864
say 0.0159538357094
image 0.015105227427
TOPIC:topic_1 12366.0
Mr 0.150331554262
s 0.0517548115801
say 0.0451237263464
TOPIC:topic_2 ....
.....
.....
TOPIC:topic_3 1066.0
say 0.062
word 0.182
and so on till 100 topics.
In this document, some words are present in every topic, while others appear in only a few topics. If a word is not present in a topic, I would like that word to be listed in that topic with a value of 0. For example, the word BBC is present in topic_1 but not in topic_0, so I would like my list to look like this:
TOPIC:topic_0 5892.0
site 0.0371690427699
Internet 0.0261371350984
online 0.0229124236253
web 0.0218940936864
say 0.0159538357094
image 0.015105227427
Mr 0
s 0
president 0
tell 0
BBC 0
TOPIC:topic_1 12366.0
Mr 0.150331554262
s 0.0517548115801
say 0.0451237263464
president 0.0153647096879
tell 0.0135856380398
BBC 0.0135856380398
site 0
Internet 0
online 0
web 0
say 0
image 0
I have to multiply these values with another set of values present in another document. For that,
from collections import defaultdict
from itertools import groupby, imap
# word -> list of that word's values, in the order they are met in doc1
d = defaultdict(list)

with open("doc1") as topics_file, open("doc2") as weights_file:
    # doc2 is read as whitespace-separated floats
    values = [float(token) for token in weights_file.read().split()]
    for raw in topics_file:
        # skip blank lines and the "TOPIC..." header lines
        if not raw.strip() or raw.startswith("TOPIC"):
            continue
        word, prob = raw.split()
        d[word].append(float(prob))

for word, probs in d.items():
    # pair the word's values with the doc2 values position by position
    total = sum(p * w for p, w in zip(probs, values))
    print("Prob for {} is {}".format(word, total))
My doc2 is of the format:
0.566667 0.0333333 0.133333 0 0 0 2.43333 0 0.13333......... till 100 values.
The above code considers the word "say". It finds the word in 3 topics and gathers its values in a list like [0.015, 0.045, 0.062]. This list is multiplied with the values in doc2 position by position: 0.015 is multiplied by the 0th value in doc2, 0.045 by the 1st value and 0.062 by the 2nd value. But this is not what I want. There is no word "say" in topic_2, so the list must be [0.015, 0.045, 0, 0.062]. Then, when these values are multiplied by the values at their respective positions in doc2, they give
P(SAY) = (0.566667*0.015) + (0.0333333*0.045) + (0.133333 *0) + (0*0.062)
So, the code is perfectly fine but just this modification is required.
The issue is that you are treating all the TOPICs as one. If you want individual sections, use the groupby code from the original answer: first collect a set of all names, then compare that set against the defaultdict keys to find the words missing from each section:
from collections import defaultdict
from itertools import groupby, imap

with open("doc1") as f, open("doc2") as f2:
    # doc2 is read as whitespace-separated floats; one is consumed per section
    values = imap(float, f2.read().split())
    # find every word in every TOPIC
    all_words = {line.split()[0] for line in f
                 if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)  # reset the pointer so the file can be iterated a second time
    # Grouping on "is this line blank?" yields alternating runs of blank
    # lines (key True) and non-blank sections (key False).
    for is_blank, section in groupby(f, key=lambda x: not x.strip()):
        if is_blank:
            continue
        # fresh accumulator for every section
        d = defaultdict(float)
        next(section)  # skip the first line of the section (the TOPIC header)
        # Get the matching float from values. It is deliberately NOT called
        # `f`, so the open file handle is not shadowed while being iterated.
        flt = next(values)
        # iterate over the remaining "word value" lines of the group
        for s in section:
            name, val = s.split()
            d[name] += float(val) * flt
        # get difference in all_words vs words in current TOPIC,
        # giving 0 as default for missing values
        for word in all_words - d.viewkeys():
            d[word] = 0
        for k, v in d.iteritems():
            print("Prob for {} is {}".format(k, v))
To store all the output you can add the dicts to a list:
from collections import defaultdict
from itertools import groupby, imap

with open("doc1") as f, open("doc2") as f2:
    # doc2 is read as whitespace-separated floats; one is consumed per section
    values = imap(float, f2.read().split())
    # every word that appears on a non-blank, non-"TOPIC" line
    all_words = {line.split()[0] for line in f
                 if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)  # rewind so the file can be iterated a second time
    out = []  # one dict per section, in file order
    # lambda x: not x.strip() splits the file into groups on the empty lines
    for is_blank, section in groupby(f, key=lambda x: not x.strip()):
        if is_blank:
            continue
        d = defaultdict(float)
        next(section)  # skip the first line of the section (the TOPIC header)
        # Matching float from values; named `flt` rather than `f` so the
        # open file handle is not rebound inside the loop that reads it.
        flt = next(values)
        for s in section:
            name, val = s.split()
            d[name] += float(val) * flt
        # words absent from this section get an explicit 0
        for word in all_words - d.viewkeys():
            d[word] = 0
        out.append(d)
Then iterate over the list:
# Walk the per-topic dicts in order and print each (word, value) pair.
for topic_probs in out:
    for item in topic_probs.iteritems():
        print("Prob for {} is {}".format(*item))
Or forget the defaultdict and use dict.fromkeys:
from itertools import groupby, imap
with open("doc1") as f, open("doc2") as f2:
    # doc2 is read as whitespace-separated floats; one is consumed per section
    values = imap(float, f2.read().split())
    # first word of every non-blank, non-"TOPIC" line (duplicates are fine:
    # dict.fromkeys keeps a single key per word)
    all_words = [line.split()[0] for line in f
                 if line.strip() and not line.startswith("TOPIC")]
    f.seek(0)  # rewind so the file can be iterated a second time
    out = []
    # lambda x: not x.strip() splits the file into groups on the empty lines
    for is_blank, section in groupby(f, key=lambda x: not x.strip()):
        if is_blank:
            continue
        # Every section starts with all words at 0.0. The same float default
        # is used for every section, so missing words are never int 0 in one
        # dict and float 0.0 in another.
        d = dict.fromkeys(all_words, 0.0)
        next(section)  # skip the first line of the section (the TOPIC header)
        # Matching float from values; named `flt` rather than `f` so the
        # open file handle is not rebound inside the loop that reads it.
        flt = next(values)
        for s in section:
            name, val = s.split()
            d[name] += float(val) * flt
        out.append(d)
If you always want the missing words at the end, use a collections.OrderedDict with the first approach, adding the missing values at the end of the dict:
from collections import OrderedDict
from itertools import groupby, imap
with open("doc1") as f, open("doc2") as f2:
    # doc2 is read as whitespace-separated floats; one is consumed per section
    values = imap(float, f2.read().split())
    # every word that appears on a non-blank, non-"TOPIC" line
    all_words = {line.split()[0] for line in f
                 if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)  # rewind so the file can be iterated a second time
    out = []
    # lambda x: not x.strip() splits the file into groups on the empty lines
    for is_blank, section in groupby(f, key=lambda x: not x.strip()):
        if is_blank:
            continue
        # Create the per-section dict BEFORE it is first used; without this
        # the first d.setdefault(...) fails with a NameError.
        d = OrderedDict()
        next(section)  # skip the first line of the section (the TOPIC header)
        # Matching float from values; named `flt` rather than `f` so the
        # open file handle is not rebound inside the loop that reads it.
        flt = next(values)
        for s in section:
            name, val = s.split()
            # setdefault keeps the first value if a word repeats in a section
            d.setdefault(name, float(val) * flt)
        # missing words are inserted last, so they end up at the end
        for word in all_words.difference(d):
            d[word] = 0
        out.append(d)

for top in out:
    for k, v in top.iteritems():
        print("Prob for {} is {}".format(k, v))
Finally to store in order and by topic:
from collections import OrderedDict
from itertools import groupby, imap
with open("doc1") as f, open("doc2") as f2:
    # doc2 is read as whitespace-separated floats; one is consumed per section
    values = imap(float, f2.read().split())
    # every word that appears on a non-blank, non-"TOPIC" line
    all_words = {line.split()[0] for line in f
                 if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)  # rewind so the file can be iterated a second time
    out = OrderedDict()  # header line (right-stripped) -> OrderedDict of words
    # lambda x: not x.strip() splits the file into groups on the empty lines
    for is_blank, section in groupby(f, key=lambda x: not x.strip()):
        if is_blank:
            continue
        topic = next(section).rstrip()
        # create OrderedDict for each topic
        words = out[topic] = OrderedDict()
        # Matching float from values; named `flt` rather than `f` so the
        # open file handle is not rebound inside the loop that reads it.
        flt = next(values)
        for s in section:
            name, val = s.split()
            # setdefault keeps the first value if a word repeats in a section
            words.setdefault(name, float(val) * flt)
        # find words missing from TOPIC and set to 0
        for word in all_words.difference(words):
            words[word] = 0

# Distinct loop variables for the outer and inner loop, so the inner loop
# no longer overwrites the names the outer loop is iterating with.
for topic, words in out.items():
    print(topic)  # each TOPIC
    for name, prob in words.iteritems():
        print("Prob for {} is {}".format(name, prob))  # the OrderedDict items
    print("\n")
doc1:
TOPIC:topic_0 5892.0
site 0.0371690427699
Internet 0.0261371350984
online 0.0229124236253
web 0.0218940936864
say 0.0159538357094
image 0.015105227427
TOPIC:topic_1 12366.0
Mr 0.150331554262
s 0.0517548115801
say 0.0451237263464
president 0.0153647096879
tell 0.0135856380398
BBC 0.0135856380398
doc2:
0.345 0.566667
Output:
TOPIC:topic_0 5892.0
Prob for site is 0.0128233197556
Prob for Internet is 0.00901731160895
Prob for online is 0.00790478615073
Prob for web is 0.00755346232181
Prob for say is 0.00550407331974
Prob for image is 0.00521130346231
Prob for BBC is 0
Prob for Mr is 0
Prob for s is 0
Prob for president is 0
Prob for tell is 0
TOPIC:topic_1 12366.0
Prob for Mr is 0.085187930859
Prob for s is 0.0293277438137
Prob for say is 0.0255701266375
Prob for president is 0.00870667394471
Prob for tell is 0.0076985327511
Prob for BBC is 0.0076985327511
Prob for web is 0
Prob for image is 0
Prob for online is 0
Prob for site is 0
Prob for Internet is 0
You can apply the exact same logic using a regular for loop, the groupby just does all the grouping work for you.
If you actually just want to write to a file, then the code is even simpler:
from itertools import groupby, imap
with open("doc1") as f, open("doc2") as f2, open("prob.txt", "w") as f3:
    # doc2 is read as whitespace-separated floats; one is consumed per section
    values = imap(float, f2.read().split())
    # every word that appears on a non-blank, non-"TOPIC" line
    all_words = set(
        line.split()[0]
        for line in f
        if line.strip() and not line.startswith("TOPIC")
    )
    f.seek(0)  # rewind for the second pass
    # blank lines separate the sections; only non-blank runs are processed
    for blank, group in groupby(f, key=lambda ln: not ln.strip()):
        if blank:
            continue
        header = next(group)
        weight = next(values)
        f3.write(header)
        seen = []
        for entry in group:
            word, prob = entry.split()
            seen.append(word)
            f3.write("{} {}\n".format(word, float(prob) * weight))
        # words that never occurred in this section are written with 0
        f3.write("".join("{} {}\n".format(missing, 0)
                         for missing in all_words.difference(seen)))
        f3.write("\n")
prob.txt:
TOPIC:topic_0 5892.0
site 0.0128233197556
Internet 0.00901731160895
online 0.00790478615073
web 0.00755346232181
say 0.00550407331974
image 0.00521130346231
BBC 0
Mr 0
s 0
president 0
tell 0
TOPIC:topic_1 12366.0
Mr 0.085187930859
s 0.0293277438137
say 0.0255701266375
president 0.00870667394471
tell 0.0076985327511
BBC 0.0076985327511
web 0
image 0
online 0
site 0
Internet 0
As an alternative, concise way of rewriting the blocks, you can store all the names in a set, then create an OrderedDict of your blocks, then get the missing names for each block using set.difference with the main words (the set words), and write them at the end of each block:
from itertools import tee
from collections import OrderedDict
# TOPIC header line -> list of the lines that follow it (up to the next header)
d = OrderedDict()
# first word of every non-blank, non-TOPIC line in the whole file
words = set()

with open('input.txt') as f, open('new', 'w') as new:
    key = None  # header of the block currently being read
    for line in f:
        # The last line of a file may lack a trailing newline; normalise it so
        # the zero entries appended after it do not end up glued to it
        # (e.g. "BBC 0.0135856380398web 0").
        if not line.endswith('\n'):
            line += '\n'
        if line.startswith('TOPIC'):
            key = line
            # register the block even if it turns out to have no entries
            d.setdefault(key, [])
            continue
        if line.strip():
            words.add(line.split()[0])
        # lines before the first TOPIC header belong to no block
        if key is not None:
            d[key].append(line)

    for k, v in d.items():
        block_words = {line.split()[0] for line in v if line.strip()}
        # words known to the file but missing from this block
        insec = words.difference(block_words)
        new.writelines([k] + v + ['{} {}\n'.format(i, 0) for i in insec])
Result :
TOPIC:topic_0 5892.0
site 0.0371690427699
Internet 0.0261371350984
online 0.0229124236253
web 0.0218940936864
say 0.0159538357094
image 0.015105227427
president 0
s 0
BBC 0
tell 0
Mr 0
TOPIC:topic_1 12366.0
Mr 0.150331554262
s 0.0517548115801
say 0.0451237263464
president 0.0153647096879
tell 0.0135856380398
BBC 0.0135856380398web 0
image 0
online 0
site 0
Internet 0
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With