我尝试通过搜索标签遍历 html 中的表,然后将找到的值更新到字典,然后将这些值写入 csv。输出当前适用于 url 和标题,但名称输出将为空白或显示“无”。但是,如果我打印 blog["name'] 的输出,它会正确提取我想要的信息。我怀疑这是一个缩进错误,但我不知道在哪里排列。我试过移动东西但似乎没有什么可以让名称分配在该循环内工作。
import os
from bs4 import BeautifulSoup
import my_csv_writer
def td_finder(tr, searchLabel):
value = ""
index = tr.text.find(searchLabel)
if index>-1:
tds = tr.findAll('td')
if len(tds)>1:
value = tds[1].text
return value
def main():
topdir = 'some_directory'
writer = my_csv_writer.CsvWriter("output.csv")
writer.writeLine(["url", "headline", "name"])
"""Main Function"""
blog = []
for root, dirs, files in os.walk(topdir):
for f in files:
url = os.path.join(root, f)
url = os.path.dirname(url).split('some_file')[1]
if f.lower().endswith((".html")):
file_new = open(os.path.join(root, f), "r").read()
soup = BeautifulSoup(file_new)
blog = {}
#Blog Title
blog["title"] = soup.find('title').text
for table in soup.findAll("table"):
for tr in table.findAll("tr"):
#name
blog["name"] = td_finder(tr, "name:")
seq = [url, unicode(blog["title"]), unicode(blog.get("name"))]
writer.writeLine(seq)
#return ""
if __name__ == '__main__':
main()
print "Finished main"