这种做法假设您的所有数据都能放入内存;如果放不下,您就只能每次加载一部分文件,甚至每次只加载两个文件。
它对文件两两进行比较,并将结果写入 summary.csv 文件,每对文件占一行。
import csv
import glob
import itertools
import os
import sys
def get_data(fname):
    """
    Load a two-column .csv file.

    Each non-blank row supplies a key (first field) and a numeric value
    (second field).  Rows are normally comma-separated; a row that the csv
    module parses as a single field is split on whitespace instead, so
    "NAME 1.5" is accepted as well as "NAME,1.5".  Any fields after the
    second are ignored.  If a key occurs more than once, the last row wins.

    Returns a dict of {'exchange':float(price)}

    Raises IndexError if a row has fewer than two fields, and ValueError if
    the second field cannot be converted to float.
    """
    # The csv module expects a binary file on Python 2, but a text file
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        inf = open(fname, 'rb')
    else:
        inf = open(fname, 'r', newline='')

    prices = {}
    with inf:
        for row in csv.reader(inf):
            # csv.reader already yields a list of fields, so a row must not
            # be split again as if it were a string.  Only a single-field
            # row is treated as whitespace-separated text.
            fields = row if len(row) > 1 else "".join(row).split()
            if not fields:
                # blank (or whitespace-only) line
                continue
            prices[fields[0].strip()] = float(fields[1])
    return prices
def do_compare(a_name, a_data, b_name, b_data):
    """
    Compare two data files of {'key': float(value)}

    Returns a list of
    - the name of the first file
    - the name of the second file
    - the number of keys in A which are not in B
    - the number of keys in B which are not in A
    - the number of values in A less than the corresponding value in B
    - the number of values in A equal to the corresponding value in B
    - the number of values in A greater than the corresponding value in B

    Only keys present in both files contribute to the last three counts.
    """
    # Iterating a dict yields its keys on both Python 2 and Python 3
    # (dict.iterkeys() exists only on Python 2).
    a_keys = set(a_data)
    b_keys = set(b_data)
    unique_to_a = len(a_keys - b_keys)
    unique_to_b = len(b_keys - a_keys)

    lt, eq, gt = 0, 0, 0
    for key in a_keys & b_keys:
        ai, bi = a_data[key], b_data[key]
        if ai < bi:
            lt += 1
        elif ai == bi:
            eq += 1
        else:
            # everything that is neither less nor equal is counted here
            gt += 1

    return [a_name, b_name, unique_to_a, unique_to_b, lt, eq, gt]
def main(directory='d:/tariff_compare'):
    """
    Compare every pair of .csv files found in `directory`.

    Loads each *.csv file with get_data(), compares each pair of files with
    do_compare(), and writes a header row plus one row per pair to
    summary.csv in the same directory.

    Side effect: changes the process working directory to `directory`.
    """
    summary_name = 'summary.csv'
    os.chdir(directory)

    # load data from csv files
    data = {}
    for fname in glob.glob("*.csv"):
        # summary.csv is this script's own output from an earlier run, not
        # an input file; loading it would fail on its text header row.
        if os.path.normcase(fname) == os.path.normcase(summary_name):
            continue
        data[fname] = get_data(fname)

    # do comparison
    # sorted() works on Python 2 and 3; dict.keys() is not a sortable list
    # on Python 3.
    files = sorted(data)

    # The csv module expects a binary file on Python 2, but a text file
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        outf = open(summary_name, 'wb')
    else:
        outf = open(summary_name, 'w', newline='')

    with outf:
        outcsv = csv.writer(outf)
        outcsv.writerow(["File A", "File B", "Unique to A", "Unique to B", "A<B", "A==B", "A>B"])
        for a, b in itertools.combinations(files, 2):
            outcsv.writerow(do_compare(a, data[a], b, data[b]))
# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
编辑:user1277476 提出了一个很好的观点:如果您事先按 exchange 键对各文件排序(或者它们本来就已排好序),就可以同时遍历所有文件,内存中只需保留每个文件的当前行。
这样您就可以对每个 exchange 条目做更深入的比较,例如统计包含某个值的文件数,或者找出最大或最小的 N 个值等。