
I wrote a script to scrape articles from a particular website so that I can feed the CSV it produces into Geneea (a text-analytics tool). The problem is that I wrote the code with unidecode, and only later realized that I need the scraped text to keep its accented characters. The code otherwise gives me the output I need, but for the text-analytics tool the accented characters are essential. Do you have any suggestions on how I could change this code?
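For example, here is what unidecode does to the text at the moment (a minimal sketch with a made-up Czech sentence):

from unidecode import unidecode

sample = "Petříček řekl, že volby budou v říjnu."
print(unidecode(sample))
# prints: Petricek rekl, ze volby budou v rijnu.  <- every accent is stripped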

Thank you all very much!

My code is below:

import requests
from bs4 import BeautifulSoup
import json
import csv
from unidecode import unidecode



def datetonumeric(stringdate):
    # Parse a scraped date such as "7.10.2021 15:30" into "day/month/year".
    # Dates before July, or in October after the 7th, are filtered out by returning "".
    spliteddate = stringdate.split()
    data = spliteddate[0].split(".")
    day = int(data[0])
    themonth = int(data[1])
    year = int(data[2])
    if themonth > 10 or themonth < 7:
        return ""
    if themonth == 10 and day > 7:
        return ""
    return f'{day}/{themonth}/{year}'
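# e.g. datetonumeric("7.10.2021 15:30") -> "7/10/2021"
#      datetonumeric("15.10.2021 9:00") -> ""   (after 7 October, filtered out)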



count=0
with open('parlamentnilistyoutput.csv', 'w', newline='', encoding="UTF-8") as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',')
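    # CSV columns: article id, source, date, title, lead paragraph (perex), body text, url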
    spamwriter.writerow(['id_clanku','zdroj','datum','title','perex','text','url'])
    for i in range(20,70):
        r=requests.get(f'https://www.parlamentnilisty.cz/special/Volby%202021?p={i}')
        soup = BeautifulSoup(r.text, 'html.parser')
        articles=soup.select(".articles-list ul.list-unstyled li")
        for article in articles:

            try:
                # id=(json.loads(article['data-track-list']))['item']['id']
                id_clanku=f'PA000{count+1}'
                urlselector=(article.select("a"))
                url=f"https://www.parlamentnilisty.cz{(urlselector[0])['href']}"
                r=requests.get(url)
                soup=BeautifulSoup(r.text, 'html.parser')
                dateselector=soup.select('div.time')
                date=(dateselector[0]).get_text()
                date=datetonumeric(date)
                print(date)
                if(date!=""):
                    titleselector=soup.select('.article-header h1')
                    title=titleselector[0].get_text()
                    title=title.replace(","," ")
                    pretextselector=soup.select("p.brief")
                    pretext=pretextselector[0].get_text()
                    pretext=pretext.replace(","," ")
                    alltext=soup.select('.article-content > p')
                    maintext=""
                    for text in alltext:
                        maintext=maintext + (text.get_text()).replace("\n"," ")
                        maintext=maintext+"\n"
                    maintext=maintext.replace(","," ")
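                    # these unidecode() calls are what strip the accents before the row is written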
                    spamwriter.writerow([id_clanku,'parlamentnilisty',date,unidecode(title),unidecode(pretext),unidecode(maintext),url])
                    count=count+1
            except Exception as e:
                print("wrong request", e)