I need help with the snippet below. I need to merge two files and perform a computation on matched lines.
I have oldFile.txt, which contains old data, and newFile.txt with an updated set of data.
I need to update oldFile.txt based on the data in newFile.txt and compute the changes in percentage. Any idea will be very helpful. Thanks in advance.
from collections import defaultdict

num = 0
data = defaultdict(int)
with open("newFile.txt", encoding='utf8', errors='ignore') as f:
    for line in f:
        grp, pname, cnt, cat = line.split(maxsplit=3)
        data[(pname.strip(), cat.replace('\n', '').strip(), grp, cat)] += int(cnt)

sorteddata = sorted([[k[0], v, k[1], k[2]] for k, v in data.items()], key=lambda x: x[1], reverse=True)

for subl in sorteddata[:10]:
    num += 1
    line = " ".join(map(str, subl))
    print("{:>5} -> {:>}".format(str(num), line))
    with open("oldFile.txt", 'a', encoding='utf8', errors='ignore') as l:
        l.write(" ".join(map(str, subl)) + '\n')
oldFile.txt
#col1 #col2 #col3 #col4
1,396 c15e89f2149bcc0cbd5fb204 4 HUH_Token (HUH)
279 9e4d81c8fc15870b15aef8dc 3 BABY BNB (BBNB)
231 31b5c07636dab8f0909dbd2d 6 Buff Unicorn (BUFFUN...)
438 1c6bc8e962427deb4106ae06 8 Charge (Charge)
2,739 6ea059a29eccecee4e250414 2 MAXIMACASH (MAXCAS...)
newFile.txt #-- updated data with additional lines not found in oldFile.txt
#col1 #col2 #col3 #col4
8,739 6ea059a29eccecee4e250414 60 MAXIMACASH (MAXCAS...)
138 1c6bc8e962427deb4106ae06 50 Charge (Charge)
860 31b5c07636dab8f0909dbd2d 40 Buff Unicorn (BUFFUN...)
200 9e4d81c8fc15870b15aef8dc 30 BABY BNB (BBNB)
20 5esdsds2sd15870b15aef8dc 30 CharliesAngel (CA) #-- not found in the oldFile.txt
1,560 c15e89f2149bcc0cbd5fb204 20 HUH_Token (HUH)
Desired output: #-- with additional columns (col5, col6), sorted on the col3 values (the arithmetic for the first row is worked out after this block)
#col1 #col2 #col3 #col4 #col5 (oldFile-newFile) #col6 (oldFile-newFile)
8,739 6ea059a29eccecee4e250414 62 MAXIMACASH (MAXCAS...) 2900.00 % (col3 2-60) 219.06 % (col1 2,739-8,739)
138 1c6bc8e962427deb4106ae06 58 Charge (Charge) 625.00 % (col3 8-50) -68.49 % (col1 438-138)
860 31b5c07636dab8f0909dbd2d 46 Buff Unicorn (BUFFUN...) 566.67 % (col3 6-40) 272.29 % (col1 231-860)
200 9e4d81c8fc15870b15aef8dc 33 BABY BNB (BBNB) 900.00 % (col3 3-30) -28.32 % (col1 279-200)
20 5esdsds2sd15870b15aef8dc 30 CharliesAngel (CA) 0.00 % (col3 0-30) 20.00 % (col1 0-20)
1,560 c15e89f2149bcc0cbd5fb204 24 HUH_Token (HUH) 400.00 % (col3 4-20) 11.75 % (col1 1,396-1,560)
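To make the expected computation explicit, here is the arithmetic for the first (MAXIMACASH) row, as I read the sample above: col5 and col6 come from 100 * (new - old) / old, and the output col3 is the sum of the old and new col3 values.

# Arithmetic for the MAXIMACASH row, using the sample values above.
old_col1, new_col1 = 2739, 8739        # col1, commas stripped
old_col3, new_col3 = 2, 60             # col3

merged_col3 = old_col3 + new_col3                   # 62
col5 = 100 * (new_col3 - old_col3) / old_col3       # 2900.00 %
col6 = 100 * (new_col1 - old_col1) / old_col1       # 219.06 %
print(merged_col3, round(col5, 2), round(col6, 2))  # 62 2900.0 219.06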
Here is sample code that outputs what you need.
I use the formula below to calculate the percentage change:
percentage_change = 100 * (new - old) / old
If old is 0, it is changed to 1 to avoid a division-by-zero error.
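The full program below applies this inline; as a minimal standalone sketch (not part of the program, just the formula and the zero guard factored out):

def pct_change(old, new):
    """Percentage change from old to new; an old value of 0 is treated as 1."""
    old = 1 if old == 0 else old
    return round(100 * (new - old) / old, 2)

print(pct_change(8, 50))   # 525.0  (Charge, col3)
print(pct_change(0, 30))   # 2900.0 (CharliesAngel, col3)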
import pandas as pd


def read_file(fn):
    """
    Read file fn and convert data into a dict of dict.
    data = {pname1: {grp: grp1, pname: pname1, cnt: cnt1, cat: cat1},
            pname2: {grp: grp2, ...} ...}
    """
    data = {}
    with open(fn, 'r') as f:
        for lines in f:
            line = lines.rstrip()
            # Skip blank lines and the '#col1 #col2 ...' header lines shown in the sample data,
            # so the script runs on the files as posted.
            if not line or line.startswith('#'):
                continue
            grp, pname, cnt, cat = line.split(maxsplit=3)
            data.update({pname: {'grp': float(grp.replace(',', '')), 'pname': pname, 'cnt': int(cnt), 'cat': cat}})
    return data


def process_data(oldfn, newfn):
    """
    Read old and new files, update the old file based on new file.
    Save output to text, and csv files.
    """
    # Get old and new data in dict.
    old = read_file(oldfn)
    new = read_file(newfn)

    # Update old data based on new data.
    u_data = {}
    for ko, vo in old.items():
        if ko in new:
            n = new[ko]

            # Update cnt.
            old_cnt = vo['cnt']
            new_cnt = n['cnt']
            u_cnt = old_cnt + new_cnt

            # cnt change; if old is zero we set it to 1 to avoid division by zero error.
            tmp_old_cnt = 1 if old_cnt == 0 else old_cnt
            cnt_change = 100 * (new_cnt - tmp_old_cnt) / tmp_old_cnt

            # grp change
            old_grp = vo['grp']
            new_grp = n['grp']
            grp_change = 100 * (new_grp - old_grp) / old_grp

            u_data.update({ko: {'grp': n['grp'], 'pname': n['pname'], 'cnt': u_cnt, 'cat': n['cat'],
                                'cnt_change%': round(cnt_change, 2), 'grp_change%': round(grp_change, 2)}})

    # Add new data to u_data that is not in old data.
    for kn, vn in new.items():
        if kn not in old:
            # Since this is a new item its old cnt is zero; we set it to 1 to avoid division by zero error.
            old_cnt = 1
            new_cnt = vn['cnt']
            cnt_change = 100 * (new_cnt - old_cnt) / old_cnt

            # grp change is similar to cnt change.
            old_grp = 1
            new_grp = vn['grp']
            grp_change = 100 * (new_grp - old_grp) / old_grp

            # Update new columns.
            vn.update({'cnt_change%': round(cnt_change, 2), 'grp_change%': round(grp_change, 2)})
            u_data.update({kn: vn})

    # Create mydata list from u_data, keeping only the dict values.
    mydata = []
    for _, v in u_data.items():
        mydata.append(v)

    # Convert mydata into a pandas dataframe to manage the data more easily.
    df = pd.DataFrame(mydata)
    df = df.sort_values(by=['cnt'], ascending=False)  # sort on cnt column

    # Save to csv file.
    df.to_csv('output.csv', index=False)

    # Save to text file.
    with open('output.txt', 'w') as w:
        w.write(f'{df.to_string(index=False)}')

    # Print in console.
    print(df.to_string(index=False))


# Start
oldfn = 'F:/Tmp/oldFile.txt'
newfn = 'F:/Tmp/newFile.txt'

process_data(oldfn, newfn)
Console output:
grp pname cnt cat cnt_change% grp_change%
8739.0 6ea059a29eccecee4e250414 62 MAXIMACASH (MAXCAS...) 2900.00 219.06
138.0 1c6bc8e962427deb4106ae06 58 Charge (Charge) 525.00 -68.49
860.0 31b5c07636dab8f0909dbd2d 46 Buff Unicorn (BUFFUN...) 566.67 272.29
200.0 9e4d81c8fc15870b15aef8dc 33 BABY BNB (BBNB) 900.00 -28.32
20.0 5esdsds2sd15870b15aef8dc 30 CharliesAngel (CA) 2900.00 1900.00
1560.0 c15e89f2149bcc0cbd5fb204 24 HUH_Token (HUH) 400.00 11.75
Text output:
grp pname cnt cat cnt_change% grp_change%
8739.0 6ea059a29eccecee4e250414 62 MAXIMACASH (MAXCAS...) 2900.00 219.06
138.0 1c6bc8e962427deb4106ae06 58 Charge (Charge) 525.00 -68.49
860.0 31b5c07636dab8f0909dbd2d 46 Buff Unicorn (BUFFUN...) 566.67 272.29
200.0 9e4d81c8fc15870b15aef8dc 33 BABY BNB (BBNB) 900.00 -28.32
20.0 5esdsds2sd15870b15aef8dc 30 CharliesAngel (CA) 2900.00 1900.00
1560.0 c15e89f2149bcc0cbd5fb204 24 HUH_Token (HUH) 400.00 11.75
CSV output:
grp,pname,cnt,cat,cnt_change%,grp_change%
8739.0,6ea059a29eccecee4e250414,62,MAXIMACASH (MAXCAS...),2900.0,219.06
138.0,1c6bc8e962427deb4106ae06,58,Charge (Charge),525.0,-68.49
860.0,31b5c07636dab8f0909dbd2d,46,Buff Unicorn (BUFFUN...),566.67,272.29
200.0,9e4d81c8fc15870b15aef8dc,33,BABY BNB (BBNB),900.0,-28.32
20.0,5esdsds2sd15870b15aef8dc,30,CharliesAngel (CA),2900.0,1900.0
1560.0,c15e89f2149bcc0cbd5fb204,24,HUH_Token (HUH),400.0,11.75
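If you want to keep working with the merged data afterwards, the generated CSV can simply be reloaded (a sketch, assuming output.csv was written to the working directory by the script above):

import pandas as pd

# Reload the merged result produced by process_data() for further analysis.
df = pd.read_csv('output.csv')

# For example, show only the rows whose cnt grew by more than 500 %.
print(df[df['cnt_change%'] > 500].to_string(index=False))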
Just to give a convtools-based alternative solution, where you may find useful pieces:
from convtools import conversion as c
from convtools.contrib.tables import Table


# your percentage change calculation: after the join, the old value is in
# "<column>_LEFT" and the new value in "<column>_RIGHT"; None is returned
# when either side is missing
def c_change(column_name):
    return c.if_(
        c.and_(
            c.col(f"{column_name}_LEFT"),
            c.col(f"{column_name}_RIGHT").is_not(None),
        ),
        (
            (c.col(f"{column_name}_RIGHT") - c.col(f"{column_name}_LEFT"))
            / c.col(f"{column_name}_LEFT")
            * 100.0
        ).pipe(round, 2),
        None,
    )


# cast the numeric columns before joining
prepare_columns = {
    "COLUMN_0": c.col("COLUMN_0").as_type(float),
    "COLUMN_2": c.col("COLUMN_2").as_type(float),
}
dialect = Table.csv_dialect(delimiter="\t")

sorted_rows = sorted(
    Table.from_csv("tmp1.csv", dialect=dialect)  # old file
    .update(**prepare_columns)
    .join(
        Table.from_csv(
            "tmp2.csv",  # new file
            dialect=dialect,
        ).update(**prepare_columns),
        on=["COLUMN_1", "COLUMN_3"],
        how="full",
    )
    .update(
        COLUMN_4=c_change("COLUMN_2"),
        COLUMN_5=c_change("COLUMN_0"),
        COLUMN_2=c.col("COLUMN_2_RIGHT"),
        COLUMN_0=c.col("COLUMN_0_RIGHT"),
    )
    .take(
        "COLUMN_0",
        "COLUMN_1",
        "COLUMN_2",
        "COLUMN_3",
        "COLUMN_4",
        "COLUMN_5",
    )
    .into_iter_rows(tuple),
    key=lambda row: row[2],  # sort on COLUMN_2 (col3), descending
    reverse=True,
)
Table.from_rows(sorted_rows).into_csv("tmp_result.csv", dialect=dialect)
Results in:
COLUMN_0 COLUMN_1 COLUMN_2 COLUMN_3 COLUMN_4 COLUMN_5
8739.0 6ea059a29eccecee4e250414 60.0 MAXIMACASH (MAXCAS...) 2900.0 219.06
138.0 1c6bc8e962427deb4106ae06 50.0 Charge (Charge) 525.0 -68.49
860.0 31b5c07636dab8f0909dbd2d 40.0 Buff Unicorn (BUFFUN...) 566.67 272.29
200.0 9e4d81c8fc15870b15aef8dc 30.0 BABY BNB (BBNB) 900.0 -28.32
20.0 5esdsds2sd15870b15aef8dc 30.0 CharliesAngel (CA)
1560.0 c15e89f2149bcc0cbd5fb204 20.0 HUH_Token (HUH) 400.0 11.75
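The CharliesAngel row has empty COLUMN_4/COLUMN_5 because c_change returns None when one side of the full join is missing. If you need explicit values there, one option is to post-process the result file (a sketch using pandas instead of convtools, assuming the tab-delimited tmp_result.csv written above):

import pandas as pd

# tmp_result.csv is tab-delimited, as written by the convtools snippet above.
df = pd.read_csv('tmp_result.csv', sep='\t')

# Fill the missing change percentages for rows that only exist in the new file.
df[['COLUMN_4', 'COLUMN_5']] = df[['COLUMN_4', 'COLUMN_5']].fillna(0.0)

df.to_csv('tmp_result_filled.csv', sep='\t', index=False)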