如果您已经知道数据结构应该是什么样子,难道不能直接跳过前 3 行,然后从其余各行中提取数据吗?例如,假设表格保存在文本文件 table_file 中,则可以这样做:
# Column-oriented container for the parsed table: one list per leaf column,
# with the three name parts nested under 'Name'.
table_data = {
    'Serial Number': [],
    'Name': {
        'First': [],
        'Middle': [],
        'Last': [],
    },
    'Marks': [],
}

with open(table_file, 'r') as table:
    # Skip the first 3 rows. The builtin next() works on both Python 2 and
    # Python 3 file objects (file.next() is Python 2 only); the None default
    # keeps a file shorter than 3 lines from raising StopIteration.
    for _ in range(3):
        next(table, None)
    for row in table:
        row = row.strip('\n').split('|')
        # Only pieces that are exactly '' are dropped (e.g. what split()
        # leaves outside a leading or trailing '|'). A cell holding only
        # spaces survives the filter and becomes '' after stripping.
        values = [r.strip() for r in row if r != '']
        assert len(values) == 5, 'expected 5 cells, got %r' % (values,)
        table_data['Serial Number'].append(int(values[0]))
        table_data['Name']['First'].append(values[1])
        table_data['Name']['Middle'].append(values[2])
        table_data['Name']['Last'].append(values[3])
        # Marks are stored as the raw stripped string, not converted.
        table_data['Marks'].append(values[4])
编辑:
要构造 table_data 字典,请参考下面的伪代码。事先声明:我测试过它,它对您的示例有效,并且应该适用于任何具有两行标题的表格。不过,这段代码比较粗糙,因为我只花了大约 10 分钟来写;但它可以作为一个不错的起点,您可以在此基础上改进和扩展。这里还假设您已经有用于提取 pos_list 和 name_list 的函数。
from itertools import tee

try:
    from itertools import izip  # Python 2: lazy zip
except ImportError:
    izip = zip  # Python 3: the builtin zip is already lazy


def pairwise(iterable):
    """Return an iterator over consecutive overlapping pairs of *iterable*.

    ``pairwise([s0, s1, s2, s3])`` yields ``(s0, s1), (s1, s2), (s2, s3)``.
    An input with fewer than two elements yields nothing.
    """
    a, b = tee(iterable)
    # Advance the second copy by one so the two iterators are offset;
    # the None default avoids StopIteration on an empty input.
    next(b, None)
    return izip(a, b)
def create_table_dict(pos_list, name_list):
    """Build an empty, possibly nested, column dict from a two-row header.

    Args:
        pos_list: One iterable of boundary positions per header row. Each
            pair of consecutive positions forms one cell interval.
            Presumably these are the offsets of the column separators;
            confirm against the code that extracts them.
        name_list: One sequence of cell names per header row, aligned with
            the intervals of the same row.

    Returns:
        dict: For every top-row cell whose interval is matched exactly by a
        second-row cell, the key is the two names joined by a space and
        stripped, and the value is ``[]``. For every top-row cell whose
        interval is closed by a later second-row cell, the key is the
        top-row name and the value is ``{second_row_name: []}`` for each
        second-row cell collected inside that interval.

    Raises:
        IndexError: If fewer than two header rows are supplied.
    """
    # Turn each row's boundary positions into (start, end) intervals.
    intervals = [list(pairwise(sub_list)) for sub_list in pos_list]

    # Pair every interval with the name of the cell it delimits, row by row.
    items = [
        list(zip(interval, name))
        for interval, name in zip(intervals, name_list)
    ]

    # The dict is built directly instead of going through an intermediate
    # 'upper:a,b' string, so names containing ':' or ',' are kept intact.
    table = {}
    for int1, name1 in items[0]:
        past_names = []
        for int2, name2 in items[1]:
            if int1[0] == int2[0]:
                if int1[1] == int2[1]:
                    # Same span in both rows: a plain, un-nested column.
                    table[' '.join((name1, name2)).strip()] = []
                elif int2[1] < int1[1]:
                    # First sub-cell of a wider top-row cell.
                    past_names.append(name2)
            elif int1[0] < int2[0]:
                if int2[1] < int1[1]:
                    # Sub-cell strictly inside the top-row cell.
                    past_names.append(name2)
                elif int1[1] == int2[1]:
                    # Last sub-cell: the top-row cell is fully covered.
                    table[name1] = {n: [] for n in past_names + [name2]}

    # Kept from the original for its console output; the parenthesised
    # single-argument form prints identically on Python 2 and Python 3.
    print(table)
    return table