Merge two files, compute changes, and sort the updated data in Python

Tags:

python

sorting

I need help improving the snippet below. I need to merge two files and perform a computation on the matching lines.

I have oldFile.txt, which contains old data, and newFile.txt, which contains an updated set of data.

I need to update oldFile.txt based on the data in newFile.txt and compute the changes as percentages. Any ideas would be very helpful. Thanks in advance.

from collections import defaultdict

num = 0

# Total cnt per (pname, cat, grp) record read from newFile.txt.
data = defaultdict(int)
with open("newFile.txt", encoding='utf8', errors='ignore') as f:
    for line in f:
        # Skip blank lines and '#'-prefixed header lines; they would otherwise
        # break the four-way unpack or the int() conversion below.
        stripped = line.strip()
        if not stripped or stripped.startswith('#'):
            continue
        # cat is the remainder of the line and may itself contain spaces.
        grp, pname, cnt, cat = stripped.split(maxsplit=3)
        data[(pname, cat, grp)] += int(cnt)

# Each row is [pname, total cnt, cat, grp]; the largest total comes first.
sorteddata = sorted(
    ([pname, total, cat, grp] for (pname, cat, grp), total in data.items()),
    key=lambda row: row[1],
    reverse=True,
)

# Append the top ten rows to oldFile.txt, opening the file once rather than
# re-opening it for every row.
with open("oldFile.txt", 'a', encoding='utf8', errors='ignore') as out:
    for num, subl in enumerate(sorteddata[:10], start=1):
        line = " ".join(map(str, subl))
        print("{:>5} -> {:>}".format(str(num), line))
        out.write(line + '\n')

oldFile.txt

 #col1             #col2        #col3  #col4
 1,396 c15e89f2149bcc0cbd5fb204   4    HUH_Token (HUH)                      
   279 9e4d81c8fc15870b15aef8dc   3    BABY BNB (BBNB)                
   231 31b5c07636dab8f0909dbd2d   6    Buff Unicorn (BUFFUN...)             
   438 1c6bc8e962427deb4106ae06   8    Charge (Charge)                      
 2,739 6ea059a29eccecee4e250414   2    MAXIMACASH (MAXCAS...)

newFile.txt #-- updated data with additional lines not found in oldFile.txt

 #col1             #col2        #col3  #col4
 8,739 6ea059a29eccecee4e250414   60   MAXIMACASH (MAXCAS...)
   138 1c6bc8e962427deb4106ae06   50   Charge (Charge)                      
   860 31b5c07636dab8f0909dbd2d   40   Buff Unicorn (BUFFUN...)             
   200 9e4d81c8fc15870b15aef8dc   30   BABY BNB (BBNB)
    20 5esdsds2sd15870b15aef8dc   30   CharliesAngel (CA)    #-- not found in the oldFile.txt
 1,560 c15e89f2149bcc0cbd5fb204   20   HUH_Token (HUH)

Need Improvement: #-- With additional columns (col5, col6) and sorted based on (col3) values

 #col1             #col2        #col3      #col4                #col5 (oldFile-newFile)   #col6 (oldFile-newFile)
 8,739 6ea059a29eccecee4e250414  62   MAXIMACASH (MAXCAS...)   2900.00 % (col3 2-60)    219.06 % (col1 2,739-8,739) 
   138 1c6bc8e962427deb4106ae06  58   Charge (Charge)           525.00 % (col3 8-50)    -68.49 % (col1   438-138)      
   860 31b5c07636dab8f0909dbd2d  46   Buff Unicorn (BUFFUN...)  566.67 % (col3 6-40)    272.29 % (col1   231-860)
   200 9e4d81c8fc15870b15aef8dc  33   BABY BNB (BBNB)           900.00 % (col3 3-30)    -28.32 % (col1   279-200) 
    20 5esdsds2sd15870b15aef8dc  30   CharliesAngel (CA)          0.00 % (col3 0-30)     20.00 % (col1   0-20) 
 1,560 c15e89f2149bcc0cbd5fb204  24   HUH_Token (HUH)           400.00 % (col3 4-20)     11.75 % (col1 1,396-1,560)

323

asked Dec 07 '21 22:12

rbutrnz

2 Answers

Here is sample code that produces the output you need. I use the following formula to calculate the percentage change: percentage_change = 100*(new-old)/old

If old is 0, it is replaced with 1 to avoid a division-by-zero error.

import pandas as pd


def read_file(fn):
    """
    Read the whitespace-separated file fn into a dict of dicts keyed by pname.

    Each data line holds four fields: grp, pname, cnt and cat, where cat is
    the remainder of the line and may contain spaces.

    data = {pname1: {grp: grp1, pname: pname1, cnt: cnt1, cat: cat1},
            pname2: {grp: grp2, ...} ...}

    grp is stored as a float (thousands separators removed) and cnt as an
    int. Blank lines and lines starting with '#' (column headers) are
    skipped. If a pname occurs more than once, the last line wins.

    Raises ValueError if a data line has fewer than four fields or if its
    grp/cnt fields are not numeric.
    """
    data = {}
    with open(fn, 'r', encoding='utf8', errors='ignore') as f:
        for raw in f:
            line = raw.strip()
            # Headers and empty lines carry no data and would break parsing.
            if not line or line.startswith('#'):
                continue
            grp, pname, cnt, cat = line.split(maxsplit=3)
            data[pname] = {
                'grp': float(grp.replace(',', '')),
                'pname': pname,
                'cnt': int(cnt),
                'cat': cat,
            }

    return data


def _pct_change(old, new):
    """Return 100 * (new - old) / old, using 1 in place of an old value of 0."""
    base = 1 if old == 0 else old
    return 100 * (new - base) / base


def process_data(oldfn, newfn, csv_out='output.csv', txt_out='output.txt'):
    """
    Read old and new files, update the old data based on the new file.
    Save output to text and csv files, print it, and return the DataFrame.

    For a pname present in both files, cnt becomes old cnt + new cnt while
    grp, pname and cat are taken from the new file. A pname present only in
    the new file is added as-is. A pname present only in the old file is not
    included in the output.

    cnt_change% and grp_change% are 100 * (new - old) / old rounded to two
    decimals; an old value of 0 (or a missing old record) is treated as 1 to
    avoid division by zero.

    oldfn, newfn: paths of the input files, parsed with read_file().
    csv_out, txt_out: paths of the csv and text output files.
    """
    columns = ['grp', 'pname', 'cnt', 'cat', 'cnt_change%', 'grp_change%']

    # Get old and new data in dict.
    old = read_file(oldfn)
    new = read_file(newfn)

    rows = []

    # Records found in both files: accumulate cnt and compare old vs new.
    for key, vo in old.items():
        if key not in new:
            continue
        vn = new[key]
        rows.append({
            'grp': vn['grp'],
            'pname': vn['pname'],
            'cnt': vo['cnt'] + vn['cnt'],
            'cat': vn['cat'],
            'cnt_change%': round(_pct_change(vo['cnt'], vn['cnt']), 2),
            'grp_change%': round(_pct_change(vo['grp'], vn['grp']), 2),
        })

    # Records found only in the new file: their old values count as zero.
    # A fresh dict is built so the records returned by read_file() stay intact.
    for key, vn in new.items():
        if key in old:
            continue
        rows.append({
            **vn,
            'cnt_change%': round(_pct_change(0, vn['cnt']), 2),
            'grp_change%': round(_pct_change(0, vn['grp']), 2),
        })

    # Naming the columns explicitly keeps the sort below valid even when
    # there are no rows at all.
    df = pd.DataFrame(rows, columns=columns)
    df = df.sort_values(by=['cnt'], ascending=False)  # sort on cnt column

    # Save to csv file.
    df.to_csv(csv_out, index=False)

    # Save to text file.
    table = df.to_string(index=False)
    with open(txt_out, 'w') as w:
        w.write(table)

    # Print in console.
    print(table)

    return df


# Entry point: merge the two input files and write/print the report.
oldfn, newfn = 'F:/Tmp/oldFile.txt', 'F:/Tmp/newFile.txt'
process_data(oldfn=oldfn, newfn=newfn)

Console output:

   grp                    pname  cnt                      cat  cnt_change%  grp_change%
8739.0 6ea059a29eccecee4e250414   62   MAXIMACASH (MAXCAS...)      2900.00       219.06
 138.0 1c6bc8e962427deb4106ae06   58          Charge (Charge)       525.00       -68.49
 860.0 31b5c07636dab8f0909dbd2d   46 Buff Unicorn (BUFFUN...)       566.67       272.29
 200.0 9e4d81c8fc15870b15aef8dc   33          BABY BNB (BBNB)       900.00       -28.32
  20.0 5esdsds2sd15870b15aef8dc   30       CharliesAngel (CA)      2900.00      1900.00
1560.0 c15e89f2149bcc0cbd5fb204   24          HUH_Token (HUH)       400.00        11.75

text output:

   grp                    pname  cnt                      cat  cnt_change%  grp_change%
8739.0 6ea059a29eccecee4e250414   62   MAXIMACASH (MAXCAS...)      2900.00       219.06
 138.0 1c6bc8e962427deb4106ae06   58          Charge (Charge)       525.00       -68.49
 860.0 31b5c07636dab8f0909dbd2d   46 Buff Unicorn (BUFFUN...)       566.67       272.29
 200.0 9e4d81c8fc15870b15aef8dc   33          BABY BNB (BBNB)       900.00       -28.32
  20.0 5esdsds2sd15870b15aef8dc   30       CharliesAngel (CA)      2900.00      1900.00
1560.0 c15e89f2149bcc0cbd5fb204   24          HUH_Token (HUH)       400.00        11.75

csv output:

grp,pname,cnt,cat,cnt_change%,grp_change%
8739.0,6ea059a29eccecee4e250414,62,MAXIMACASH (MAXCAS...),2900.0,219.06
138.0,1c6bc8e962427deb4106ae06,58,Charge (Charge),525.0,-68.49
860.0,31b5c07636dab8f0909dbd2d,46,Buff Unicorn (BUFFUN...),566.67,272.29
200.0,9e4d81c8fc15870b15aef8dc,33,BABY BNB (BBNB),900.0,-28.32
20.0,5esdsds2sd15870b15aef8dc,30,CharliesAngel (CA),2900.0,1900.0
1560.0,c15e89f2149bcc0cbd5fb204,24,HUH_Token (HUH),400.0,11.75

128

answered Oct 27 '22 20:10

ferdy

Here is an alternative solution based on convtools; you may find some of its pieces useful:

from convtools import conversion as c
from convtools.contrib.tables import Table

# your percentage change calculation
def c_change(column_name):
    """
    Build a convtools conversion for the percentage change of one column.

    The conversion reads the "<column_name>_LEFT" and "<column_name>_RIGHT"
    columns and evaluates (RIGHT - LEFT) / LEFT * 100.0, rounded to two
    decimals. The LEFT column is used directly as a condition together with
    "RIGHT is not None"; when that condition fails the result is None
    (presumably this covers a missing or zero LEFT value -- confirm against
    convtools' and_ semantics).
    """
    left_name = f"{column_name}_LEFT"
    right_name = f"{column_name}_RIGHT"

    has_both = c.and_(
        c.col(left_name),
        c.col(right_name).is_not(None),
    )
    delta = c.col(right_name) - c.col(left_name)
    percentage = (delta / c.col(left_name) * 100.0).pipe(round, 2)

    return c.if_(has_both, percentage, None)

# Cast COLUMN_0 and COLUMN_2 to float; applied to both tables below so the
# values can be subtracted and sorted numerically.
prepare_columns = {
    "COLUMN_0": c.col("COLUMN_0").as_type(float),
    "COLUMN_2": c.col("COLUMN_2").as_type(float),
}
# Tab-delimited dialect, used for reading both inputs and writing the result.
dialect = Table.csv_dialect(delimiter="\t")

# Join tmp1.csv with tmp2.csv, compute the change columns, and sort the
# resulting rows by COLUMN_2 in descending order.
# NOTE(review): COLUMN_2 is taken from COLUMN_2_RIGHT, so a row that exists
# only in tmp1.csv would presumably carry None there and could break the
# sort -- confirm against the real data.
sorted_rows = sorted(
    Table.from_csv("tmp1.csv", dialect=dialect)
    .update(**prepare_columns)
    .join(
        Table.from_csv(
            "tmp2.csv",
            dialect=dialect,
        ).update(**prepare_columns),
        # Rows are matched on COLUMN_1 and COLUMN_3; the remaining columns are
        # referenced below with _LEFT (tmp1.csv) and _RIGHT (tmp2.csv) suffixes.
        on=["COLUMN_1", "COLUMN_3"],
        how="full",
    )
    .update(
        # Percentage changes of COLUMN_2 and COLUMN_0 (see c_change).
        COLUMN_4=c_change("COLUMN_2"),
        COLUMN_5=c_change("COLUMN_0"),
        # The output keeps the values from the right-hand table (tmp2.csv).
        COLUMN_2=c.col("COLUMN_2_RIGHT"),
        COLUMN_0=c.col("COLUMN_0_RIGHT"),
    )
    .take(
        "COLUMN_0",
        "COLUMN_1",
        "COLUMN_2",
        "COLUMN_3",
        "COLUMN_4",
        "COLUMN_5",
    )
    .into_iter_rows(tuple),
    # row[2] is COLUMN_2, the third column selected by take() above.
    key=lambda row: row[2],
    reverse=True,
)

# Write the sorted rows to a tab-delimited file.
Table.from_rows(sorted_rows).into_csv("tmp_result.csv", dialect=dialect)

Results in:

COLUMN_0    COLUMN_1    COLUMN_2    COLUMN_3    COLUMN_4    COLUMN_5
8739.0  6ea059a29eccecee4e250414    60.0    MAXIMACASH (MAXCAS...)  2900.0  219.06
138.0   1c6bc8e962427deb4106ae06    50.0    Charge (Charge) 525.0   -68.49
860.0   31b5c07636dab8f0909dbd2d    40.0    Buff Unicorn (BUFFUN...)    566.67  272.29
200.0   9e4d81c8fc15870b15aef8dc    30.0    BABY BNB (BBNB) 900.0   -28.32
20.0    5esdsds2sd15870b15aef8dc    30.0    CharliesAngel (CA)      
1560.0  c15e89f2149bcc0cbd5fb204    20.0    HUH_Token (HUH) 400.0   11.75

answered Oct 27 '22 21:10

westandskif

Related questions
                            
                                Python Abstract class with concrete methods
                            
                                python - how to docstring kwargs and their expected types
                            
                                How to bulk write TFRecords?
                            
                                Render current status only on template in StreamingHttpResponse in Django
                            
                                Django OneToOneField default value
                            
                                Call an async function in an normal function
                            
                                How to generate random numbers to satisfy a specific mean and median in python?
                            
                                How to use SMOP to convert Matlab into Python code
                            
                                How do I use an AWS SessionToken to read from S3 in pyspark?
                            
                                How to run Keras.model() for prediction inside a tensorflow session?
                            
                                Python freezes on smtplib.SMTP("smtp.gmail.com", 587)
                            
                                How to use boto3 client with Python multiprocessing?
                            
                                Pytest: How to parametrize a test with a list that is returned from a fixture?
                            
                                Full gradient descent in keras
                            
                                Why does pytesseract fail to recognise digits from image with darker background?
                            
                                Python Google Cloud Function Logging Severity and Duplicates
                            
                                Flask admin remember form value
                            
                                Despite installing the torch vision pytorch library, I am getting an error saying that there is no module named torch vision
                            
                                Getting coordinates of the closest data point on matplotlib plot
                            
                                Google cloud storage python client AttributeError: 'ClientOptions' object has no attribute 'scopes' occurs after deployment

Donate For Us

If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!

Donate Us With