python - 提取 strong 标签的文本及其后的 p 段落

Question

我已经编写了一段提取 div 的代码（见下文），但现在我想把所有 strong 标签的文本显示在一列中，并把紧随其后的文本显示在另一列中（需要处理目录中的多个文件）。我在 Dropbox 上传了一个示例文件：（https://www.dropbox.com/s/kbnal2pefih2ru4/test.html?dl=0）。

到目前为止，我的代码是：

import textwrap
import os
from bs4 import BeautifulSoup

# Folder that is scanned (non-recursively) for saved HTML pages.
directory ='C:/Research syntheses - Meta analysis/SeekingAlpha/Tests/'
for filename in os.listdir(directory):
    # Only files whose name ends in ".html" are parsed.
    if filename.endswith('.html'):
        fname = os.path.join(directory,filename)
        with open(fname, 'r') as f:
            soup = BeautifulSoup(f.read(),'html.parser')
            # Look up the <div id="article_qanda" class="content_part hid"> element.
            participants = soup.find('div',class_='content_part hid', id='article_qanda')
        # Show the file name next to whatever the lookup returned.
        print(filename, participants)

所以我需要的输出是：第 1 列为所有 strong 标签的文本，第 2 列为其后的 p 段落（有时不止一个）。希望有人能帮帮我！

score 0 · Accepted Answer

使用@rxw 的代码，我在最终解决方案中进一步编辑了他的答案：

import textwrap
import os
from bs4 import BeautifulSoup
import pandas as pd
import textwrap
import os
from bs4 import BeautifulSoup

# Folder that is scanned (non-recursively) for saved transcript pages.
directory = 'C:/Research syntheses - Meta analysis/Transcripts'
for filename in os.listdir(directory):
    if not filename.endswith('.html'):
        continue
    fname = os.path.join(directory, filename)
    # errors='ignore' drops undecodable bytes instead of raising.
    with open(fname, errors='ignore') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    participants = soup.find('div', class_='content_part hid', id='article_qanda')
    if not participants:
        continue

    # Each row is [strong_text, following_text, following_text, ...].
    rows = []
    for p in participants:
        name = p.find("strong")
        # NOTE(review): plain-text children presumably resolve `find` to
        # str.find, which yields -1 instead of a tag; such nodes are skipped.
        if name is not None and str(name) != "-1":
            rows.append([name.text])
        elif name is None and rows:
            # Text that appears before the first <strong> has no row to
            # attach to, so it is skipped rather than raising IndexError.
            rows[-1].append(p.text)

    # Print once per file, after every child has been collected (printing
    # inside the loop above would repeat the whole table for every child).
    for r in rows:
        if len(r) > 1:
            # r[0] is the "strong" text, r[1:] are all the paragraphs after it.
            print("%s => %s" % (r[0], " ".join(r[1:])))
        else:
            # Only the "strong" text is available for this row.
            print(r[0])

score 0 · Accepted Answer

您可以遍历所有参与者并保存一个包含每行列的临时数组。然后，您可以根据需要显示它们。这是一个例子：

import textwrap
import os
from bs4 import BeautifulSoup

# Single sample page to parse.
fname = "test.html"
with open(fname, 'r') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')
participants = soup.find('div', class_='content_part hid', id='article_qanda')

# Each row is [strong_text, following_text, following_text, ...].
rows = []
# `or []` keeps the script from failing when the div is not in the page.
for p in participants or []:
    name = p.find("strong")
    # NOTE(review): plain-text children presumably resolve `find` to
    # str.find, which yields -1 instead of a tag; such nodes are skipped.
    if name is not None and str(name) != "-1":
        rows.append([name.text])
    elif name is None and rows:
        # Text that appears before the first <strong> has no row to attach
        # to, so it is skipped rather than raising IndexError.
        rows[-1].append(p.text)

# Now print all the rows.
for r in rows:
    if len(r) > 1:
        # r[0] is the "strong" text, r[1:] are all the paragraphs after it.
        print("%s => %s" % (r[0], " ".join(r[1:])))
    else:
        # Only the "strong" text is available for this row.
        print(r[0])

编辑：

我从 soup.find 中删除了 class_='content_part hid'，去掉了一层循环，并添加了多进程部分。您可以在此处找到有关多进程的信息：

import os
from bs4 import BeautifulSoup
import multiprocessing as mp

def process(filename):
    """Print the "strong => following text" pairs found in one HTML file.

    Args:
        filename: File name relative to the module-level ``directory``.
            Names that do not end in ``.html`` are ignored.

    Returns:
        None. The extracted text is written to standard output.
    """
    if not filename.endswith('.html'):
        return
    fname = os.path.join(directory, filename)
    # errors='ignore' drops undecodable bytes instead of raising.
    with open(fname, errors='ignore') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    participants = soup.find('div', id='article_qanda')
    if not participants:
        return

    # Collect the output and emit it with a single print call: when several
    # worker processes run this function at once, many small partial-line
    # prints can end up interleaved between files.
    parts = []
    for p in participants:
        name = p.find("strong")
        # NOTE(review): plain-text children presumably resolve `find` to
        # str.find, which yields -1 instead of a tag; such nodes are skipped.
        if name is not None and str(name) != "-1":
            # Start a new output line for every <strong> text.
            parts.append("\n" + name.text + " => ")
        elif name is None:
            parts.append(p.text + " ")
    print("".join(parts), end='')

# Folder scanned for .html files; process() reads this as a global.
directory = '.'

if __name__ == '__main__':
    # Using the pool as a context manager releases the worker processes
    # once the blocking map() call has returned.
    with mp.Pool() as pool:
        pool.map(process, os.listdir(directory))

python - 提取 strong 标签的文本及其后的 p 段落

2 回答 2

Related

Reference