您可以遍历所有参与者,并用一个临时数组保存每一行的各列内容;之后就可以按照自己需要的方式显示它们。下面是一个例子:
import textwrap
import os
from bs4 import BeautifulSoup
# Parse one saved HTML page and group its Q&A section into rows of
# [speaker, paragraph, paragraph, ...], then print one line per speaker.
fname = "test.html"
with open(fname, 'r') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

# soup.find returns None when the page has no matching <div>; iterating None
# would raise TypeError, so that case is handled explicitly below.
participants = soup.find('div', class_='content_part hid', id='article_qanda')

rows = []
if participants is not None:
    # Walk only the direct child *tags*. Iterating the div itself also yields
    # bare text nodes, which are str subclasses: calling .find("strong") on
    # them runs str.find and returns an int, which crashes on .text as soon as
    # the text happens to contain the word "strong".
    for p in participants.find_all(True, recursive=False):
        name = p.find("strong")
        if name is not None:
            # A <strong> tag starts a new speaker row.
            rows.append([name.text])
        elif rows:
            # A paragraph without <strong> belongs to the most recent speaker.
            rows[-1].append(p.text)
        # Paragraphs that appear before the first speaker have no row to join
        # and are skipped (previously this raised IndexError).

# now print all the rows
for r in rows:
    if len(r) > 1:
        # here you can display them as you wish.
        # r[0] contains the text of the "strong" tag
        # r[1] contains the next "p" tag; any further paragraphs of the same
        # speaker are available in r[2:]
        print("%s => %s" % (r[0], r[1]))
    else:
        # here you have only the "strong" tag
        print(r[0])
编辑:
我从 soup.find 中删除了 class_='content_part hid',去掉了一个循环,并添加了多进程部分。有关多进程的更多信息,可以参考 Python 官方文档中 multiprocessing 模块的说明:
import os
from bs4 import BeautifulSoup
import multiprocessing as mp
def process(filename):
    """Print the speaker/paragraph transcript found in one HTML file.

    ``filename`` is a bare file name; it is joined with the module-level
    ``directory`` to build the path. Names that do not end in ``.html`` and
    files without a ``<div id="article_qanda">`` are ignored.

    For every direct child tag of that div, a tag containing ``<strong>``
    starts a new line of the form ``"<speaker> => "`` and every other tag
    contributes its text followed by a space.

    Returns None; the result is written to standard output.
    """
    if not filename.endswith('.html'):
        return

    fname = os.path.join(directory, filename)
    # errors='ignore' drops undecodable bytes instead of raising.
    with open(fname, errors='ignore') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    participants = soup.find('div', id='article_qanda')
    if participants is None:
        return

    # Collect the whole transcript first and emit it with one print call.
    # This function runs in several pool workers at once; printing piece by
    # piece lets fragments from different files interleave on stdout.
    chunks = []
    # Only direct child tags are visited. Bare text nodes are str subclasses,
    # so .find("strong") on them would be str.find and return an int, which
    # breaks on .text whenever the text contains the word "strong".
    for p in participants.find_all(True, recursive=False):
        name = p.find("strong")
        if name is not None:
            chunks.append("\n" + name.text + " => ")
        else:
            chunks.append(p.text + " ")

    if chunks:
        # The output does not end with a newline, so flush explicitly; a
        # buffered tail could otherwise be lost if the worker is terminated.
        print("".join(chunks), end='', flush=True)
# Folder scanned for .html files; process() reads this module-level name.
directory = '.'

# The guard keeps the pool from being created again when worker processes
# import this module (required with the "spawn" start method).
if __name__ == '__main__':
    with mp.Pool() as pool:
        # map blocks until every file name has been handled.
        pool.map(process, os.listdir(directory))
        # Shut the workers down in an orderly way before the context manager
        # terminates the pool, so they get the chance to exit normally
        # instead of being killed.
        pool.close()
        pool.join()