1

我已将 os.walk 包含在其中,但它仍然不会输出 .csv,而且我不确定它是否正在读取 html 目录。我有一个离线网站目录,我需要抓取这些信息(例如 url、电子邮件、姓名、电话等)并输出到 .csv。当我运行它时(我知道它远非可执行),它挂在第 13 行,出现权限被拒绝错误。

import os, csv
from bs4 import BeautifulSoup


def main(folder, outputfile):
    with open(outputfile, "wb") as f:
        w = csv.writer(f)
        header = ("Tag", "Name", "Name", "Email", "Phone", "Location", "URL")
        w.writerow(header)
        rows = crawlhtmls(folder)
        w.writerows(rows)

def crawlhtmls(folder):
        for root, dirs, files in os.walk(folder):
    for f in files:
        if f.lower().endswith(".html"):
        soup = BeautifulSoup(f.read())
        events = soup.findAll('div', attrs={'class': 'post'})
        headline = x.find('h2')
        name = x.find('')
        email = x.find('address') 
        phone = x.find('tel')
        description = x.find('div', attrs={'class': 'entry'})

        headline2 = str(headline)
        name2 = str(name)
        email2 = str(name)
        phone2 = str(phone)

        description2 = str(description)

        headline3 = headline2.replace(",", " -")
        name3 = name2.replace(",", " -")
        email3 = email2.replace(",", " -")
        phone3 = phone2.replace(",", " -")
        description3 = description2.replace(",", " -")

        headline4 = headline3.replace('<h2 class', "")
        headline5 = headline4.replace('</h2>', "")
        headline6 = headline5.replace('- ', "")
        headline7 = headline6.replace("at ", "")

        description4 = description3.replace('[<p>', "")
        description5 = description4.replace('</p>]', "")
        description6 = description5.replace('\n', " ")
        description7 = description6.replace('[]', "")

        link4 = link3.replace('<a href', "")
        link5 = link4.replace('</a>', "")
        link6 = link5.replace('h2', " ")
        link7 = link6.replace('=', "")

        seq = (headline7, name3, email3, phone3, descripton7)
        yield seq

if __name__ == "__main__":
    folderPath = r"C:\projects\training\html" 
    output = r"C:\projects\training\about.csv" 
    main(folderPath, output)
4

0 回答 0