# Wednesday, 8 May 2024

# Comparing two files and their headers using pandas and lists

import pandas

import os

import re

# Folders scanned for the old and the new (current-day) files.
Source_Folder_OldFiles = "./data//Old_Columns_Files"
Source_Folder_NewFiles = "./data//New_Columns_Files"

# Accumulators: one header (the DataFrame's ``columns``) is appended per
# matching file found by the loops below.
old_col_list = []
new_col_list = []

############################################### Loop through old files###############       

# Collect the header of every matching file in the old-files folder.
# os.listdir() returns names in arbitrary order, so sort them: the comparison
# at the end of the script pairs old and new headers by list position.
for file in sorted(os.listdir(Source_Folder_OldFiles)):

    # Keep only files whose name contains "000000" and starts with one of
    # the known prefixes (str.startswith accepts a tuple of alternatives).
    if "000000" in file and file.startswith((
        "abc_Re_",
        "def_Re_",
        "ghi_Re_",
        "jkl_Re_",
        "mno_Re_",
        "pqr_Re_",
        "stu_Re_",
        "vwx_Re_",
        "yz_Re_",
    )):

        # Build the path from the configured folder rather than repeating
        # the directory as a second hard-coded string.
        # NOTE(review): only .columns is used here; if old_file_df is not
        # needed elsewhere, read_csv(..., nrows=0) would skip the data rows.
        old_file_df = pandas.read_csv(os.path.join(Source_Folder_OldFiles, file))

        print(file)

        old_col_list.append(old_file_df.columns)


# Show the first collected header as a sample. Guarded because indexing an
# empty list raises IndexError when no file matched the filter.
if old_col_list:
    print(old_col_list[0])

print(len(old_col_list))


############################################### Loop through New or current day files###############       

# Collect the header of every matching file in the new-files folder.
# os.listdir() returns names in arbitrary order, so sort them: the comparison
# at the end of the script pairs old and new headers by list position.
for file in sorted(os.listdir(Source_Folder_NewFiles)):

    # Keep only files whose name contains "000000" and starts with one of
    # the known prefixes (str.startswith accepts a tuple of alternatives).
    if "000000" in file and file.startswith((
        "abc_Re_",
        "def_Re_",
        "ghi_Re_",
        "jkl_Re_",
        "mno_Re_",
        "pqr_Re_",
        "stu_Re_",
        "vwx_Re_",
        "yz_Re_",
    )):

        # Build the path from the configured folder rather than repeating
        # the directory as a second hard-coded string.
        # NOTE(review): only .columns is used here; if new_file_df is not
        # needed elsewhere, read_csv(..., nrows=0) would skip the data rows.
        new_file_df = pandas.read_csv(os.path.join(Source_Folder_NewFiles, file))

        print(file)

        new_col_list.append(new_file_df.columns)


        

##########################  Loop the logic for all files###########################


# Old and new headers are paired by list position. zip() stops at the shorter
# list, so report a count mismatch instead of silently skipping files.
if len(old_col_list) != len(new_col_list):
    print("File count mismatch:", len(old_col_list), "old vs", len(new_col_list), "new")

# Compare every available pair rather than a hard-coded first two, which
# raised IndexError with fewer than two files and ignored any extra ones.
for old_cols, new_cols in zip(old_col_list, new_col_list):

    print(new_cols)

    print(len(new_col_list))

    # Position-by-position equality of the two headers (True where the names
    # line up). Not printed; kept available for ad-hoc inspection.
    result = [a == b for a, b in zip(old_cols, new_cols)]

    # Columns of the old file that also exist in the new file.
    matches = [i for i in old_cols if i in new_cols]

    print(matches)

    # Columns of the new file that are absent from the old file, and how many.
    no_matches = [j for j in new_cols if j not in old_cols]

    print(no_matches, len(no_matches))