我不清楚您打算如何选择 Header1+Header2 或 Header1+Header3 等列组合。由于表格应该相当小,我建议先收集全部数据,然后再从中提取所需的列子集。下面的代码展示了一种可能的解决方案:
import lxml.etree as ET
def parseTable(table_fragment):
    """Split a table fragment into its header row and its data rows.

    The fragment is parsed with ``ET.fromstring``.  Every child of the
    root element is treated as a row, and every child of a row as a cell
    whose ``.text`` attribute becomes the cell value.

    Args:
        table_fragment: markup of the table, as accepted by ``ET.fromstring``.

    Returns:
        A ``(header, rows)`` tuple.  ``header`` is the list of cell texts of
        the first row, or ``None`` when the root element has no children.
        ``rows`` is a list with one list of cell texts per remaining row.
    """
    root = ET.fromstring(table_fragment)

    # One list of cell texts per row element, in document order.
    parsed = [[cell.text for cell in tr] for tr in root]

    if not parsed:
        # No rows at all: there is no header to report.
        return None, []

    # The first row is the header, everything after it is data.
    return parsed[0], parsed[1:]
def extractColumns(header, rows, clst):
    """Return the selected columns of a parsed table.

    Args:
        header: sequence of column titles.
        rows: iterable of row sequences, indexed the same way as ``header``.
        clst: iterable of one-based column numbers, in the desired output
            order.  Any iterable works, including a one-shot iterator.

    Returns:
        A ``(header2, rows2)`` tuple: the list of selected titles and a
        list holding, for every input row, the list of selected cells.

    Raises:
        IndexError: if a column number is smaller than 1, or lies beyond
            the end of ``header`` or of any row.
    """
    # Convert to zero-based indices exactly once, so that ``clst`` may be
    # a generator and is not exhausted after the header pass.
    indices = []
    for i in clst:
        if i < 1:
            # Without this guard 0 or a negative number would silently
            # wrap around and select a column counted from the end.
            raise IndexError('column numbers are one-based, got %r' % (i,))
        indices.append(i - 1)

    header2 = [header[k] for k in indices]
    rows2 = [[row[k] for k in indices] for row in rows]
    return header2, rows2
def myRepr(header, rows):
    """Format a table as a bracketed, one-tuple-per-line string.

    Args:
        header: iterable of column titles; rendered as the first tuple.
        rows: iterable of rows; each one is rendered as a further tuple.

    Returns:
        A string that starts with ``'[\\n'``, ends with ``'\\n]'`` and
        contains the ``repr`` of each tuple, separated by ``',\\n'``.
    """
    # Header first, then every data row, each shown as a tuple literal.
    lines = [repr(tuple(header))]
    lines.extend(repr(tuple(record)) for record in rows)

    body = ',\n'.join(lines)
    return '[\n' + body + '\n]'
# Sample input: a table with four columns whose first row holds the titles.
table_fragment = '''\
<table border="1">
<tr>
<td>Header_1</td>
<td>Header_2</td>
<td>Header_3</td>
<td>Header_4</td>
</tr>
<tr>
<td>row 1_cell 1</td>
<td>row 1_cell 2</td>
<td>row 1_cell 3</td>
<td>row 1_cell 4</td>
</tr>
<tr>
<td>row 2_cell 1</td>
<td>row 2_cell 2</td>
<td>row 1_cell 3</td>
<td>row 1_cell 4</td>
</tr>
<tr>
<td>row 3_cell 1</td>
<td>row 3_cell 2</td>
<td>row 1_cell 3</td>
<td>row 1_cell 4</td>
</tr>
<tr>
<td>row 4_cell 1</td>
<td>row 4_cell 2</td>
<td>row 1_cell 3</td>
<td>row 1_cell 4</td>
</tr>
</table>'''

# Parse the table
header, rows = parseTable(table_fragment)

# For debugging...  The parenthesized single-argument form prints the same
# text under Python 2 and is also valid under Python 3, where the bare
# ``print header`` statement is a syntax error.
print(header)
print(rows)

# Collect the representations of the selections. The extractColumns()
# returns a tuple. The * expands it to two arguments.
lst = [
    myRepr(header, rows),                               # the whole table
    myRepr(*extractColumns(header, rows, [1, 2])),      # columns 1 and 2
    myRepr(*extractColumns(header, rows, [1, 3])),      # columns 1 and 3
    myRepr(*extractColumns(header, rows, [1, 2, 4])),   # columns 1, 2 and 4
]

# Write the output: all representations wrapped in one outer pair of
# brackets and separated by ',\n'.
with open('output.txt', 'w') as f:
    f.write('[\n')
    f.write(',\n'.join(lst))
    f.write('\n]')
output.txt 现在包含:
[
[
('Header_1', 'Header_2', 'Header_3', 'Header_4'),
('row 1_cell 1', 'row 1_cell 2', 'row 1_cell 3', 'row 1_cell 4'),
('row 2_cell 1', 'row 2_cell 2', 'row 1_cell 3', 'row 1_cell 4'),
('row 3_cell 1', 'row 3_cell 2', 'row 1_cell 3', 'row 1_cell 4'),
('row 4_cell 1', 'row 4_cell 2', 'row 1_cell 3', 'row 1_cell 4')
],
[
('Header_1', 'Header_2'),
('row 1_cell 1', 'row 1_cell 2'),
('row 2_cell 1', 'row 2_cell 2'),
('row 3_cell 1', 'row 3_cell 2'),
('row 4_cell 1', 'row 4_cell 2')
],
[
('Header_1', 'Header_3'),
('row 1_cell 1', 'row 1_cell 3'),
('row 2_cell 1', 'row 1_cell 3'),
('row 3_cell 1', 'row 1_cell 3'),
('row 4_cell 1', 'row 1_cell 3')
],
[
('Header_1', 'Header_2', 'Header_4'),
('row 1_cell 1', 'row 1_cell 2', 'row 1_cell 4'),
('row 2_cell 1', 'row 2_cell 2', 'row 1_cell 4'),
('row 3_cell 1', 'row 3_cell 2', 'row 1_cell 4'),
('row 4_cell 1', 'row 4_cell 2', 'row 1_cell 4')
]
]