要基于一个或多个公共列合并多个文件(甚至多于 2 个),Python 中最好、最高效的方法之一是使用 “brewery” 库。您还可以指定合并时要考虑哪些字段,以及最终需要保留哪些字段。
# Standard library.
import sys

# Third-party: brewery provides FieldList, and its ds submodule provides the
# CSV data source/target streams used below.
import brewery
from brewery import ds
# Source definitions, one dict per input CSV file:
#   "file"   - path of the CSV file to read
#   "fields" - field names to assign to that file's columns (the merge step
#              sets these on the data source instead of reading the header)
sources = [
    {
        "file": "grants_2008.csv",
        "fields": ["receiver", "amount", "date"],
    },
    {
        "file": "grants_2009.csv",
        "fields": ["id", "receiver", "amount", "contract_number", "date"],
    },
    {
        "file": "grants_2010.csv",
        "fields": ["receiver", "subject", "requested_amount", "amount", "date"],
    },
]
创建一个包含所有字段的列表,并额外加入一个 “file” 字段,用于记录每条数据来自哪个源文件。然后遍历各个源定义,收集其中出现的字段:
# Start the merged field list with "file": the merge loop stores the source
# file name under this key in every record it writes.
all_fields = ["file"]

# Walk the source definitions and collect each field name once, in the order
# it is first seen, so the merged output has one column per distinct field.
for source in sources:
    for field in source["fields"]:
        if field not in all_fields:
            all_fields.append(field)

# Open the merged CSV output and give it the combined field list.
out = ds.CSVDataTarget("merged.csv")
out.fields = brewery.FieldList(all_fields)
out.initialize()
# Copy every record of every source file into the merged output target.
try:
    for source in sources:
        path = source["file"]

        # Initialize the data source without reading its header
        # (use XLSDataSource for XLS files). The header's field names are
        # ignored because the fields were declared in `sources`; the header
        # row itself still has to be skipped.
        src = ds.CSVDataSource(path, read_header=False, skip_rows=1)
        src.fields = ds.FieldList(source["fields"])
        src.initialize()

        try:
            for record in src.records():
                # Add a file reference to the output, to know where the row
                # comes from.
                record["file"] = path
                out.append(record)
        finally:
            # Close the source stream even if reading or writing failed.
            src.finalize()
finally:
    # Close the merged output so all rows are written to merged.csv.
    # NOTE(review): assumes the target exposes finalize() like the source
    # stream does - confirm against the brewery version in use.
    out.finalize()
# Shell command (not Python): run after the script has finished; it feeds merged.csv to `brewery pipe pretty_printer` on stdin.
cat merged.csv | brewery pipe pretty_printer