这不是一个命令行解决方案,而是一个有趣的脚本。
import csv
csv_reader = csv.reader(open('data.csv', 'rb'))
# Read first two rows of field text out as a prefix.
prefix = ' '.join(csv_reader.next()[2].strip() for i in range(2))
def collapsed_row_iter():
depth_value_list = []
for (_, field_id, field_text, _) in csv_reader:
# Count number of leading <SPACE> chars to determine depth.
pre_strip_text_len = len(field_text)
field_text = field_text.lstrip()
depth = pre_strip_text_len - len(field_text)
depth_value_list_len = len(depth_value_list)
if depth == depth_value_list_len + 1:
# Append a new depth value.
depth_value_list.append(field_text.rstrip())
if depth <= depth_value_list_len:
# Truncate list to depth, append new value.
del depth_value_list[depth:]
depth_value_list.append(field_text.rstrip())
else:
# Depth value is greater than current_depth + 1
raise ValueError
# Only yield the row if field_id value is non-NULL.
if field_id:
yield (field_id, '%s %s' % (prefix, ' '.join(depth_value_list)))
# Get CSV writer object, write the header.
csv_writer = csv.writer(open('collapsed.csv', 'wb'))
csv_writer.writerow(['FIELD', 'TEXT'])
# Iterate over collapsed rows, writing each to the output CSV.
for (field_id, collapsed_text) in collapsed_row_iter():
csv_writer.writerow([field_id, collapsed_text])
输出:
FIELD,TEXT
P0190001,RACE OF HOUSEHOLDER BY HOUSEHOLD TYPE (8) Universe: Households White Family Households: Married-couple family: With related children
P0190002,RACE OF HOUSEHOLDER BY HOUSEHOLD TYPE (8) Universe: Households White Family Households: Married-couple family: No related children
P0190003,"RACE OF HOUSEHOLDER BY HOUSEHOLD TYPE (8) Universe: Households White Family Households: Other family: Male householder, no wife present: With related children"
P0190004,"RACE OF HOUSEHOLDER BY HOUSEHOLD TYPE (8) Universe: Households White Family Households: Other family: Male householder, no wife present: No related children"