I'm using the Python newspaper library, running on a Linux VPS:
The first problem concerns a news site whose articles I'm trying to parse: it claims I'm using an ad blocker and refuses to show any article content until I disable it. I'm obviously not running one. It seems I need to add something to the request headers, but I'm not sure what or how. Any help would be appreciated.
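From searching around, it sounds like newspaper sends its own "newspaper/x.x" User-Agent by default, and these anti-ad-block checks may key off a non-browser User-Agent. I'm thinking of overriding it through newspaper's Config object, something like this (untested sketch; the Chrome UA string is just an example, and I'm assuming browser_user_agent and request_timeout are the right attribute names):

import newspaper
from newspaper import Config

# Pretend to be an ordinary desktop browser instead of the default
# "newspaper/x.x" User-Agent (any recent browser UA string should do)
config = Config()
config.browser_user_agent = ('Mozilla/5.0 (X11; Linux x86_64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/60.0.3112.90 Safari/537.36')
config.request_timeout = 10

# 'http://example-news-site.com' is a placeholder for the real source URL
paper = newspaper.build('http://example-news-site.com', config=config,
                        memoize_articles=False)

Would that be enough, or do these sites check other headers (Accept, Referer, cookies) too?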
The second concerns some errors I get when I run the script.
The error is:
Traceback (most recent call last):
  File "/usr/local/lib/python3.3/site-packages/PIL/ImageFile.py", line 100, in __init__
    self._open()
  File "/usr/local/lib/python3.3/site-packages/PIL/PngImagePlugin.py", line 514, in _open
    self.png.crc(cid, s)
  File "/usr/local/lib/python3.3/site-packages/PIL/PngImagePlugin.py", line 142, in crc
    crc2 = i16(self.fp.read(2)), i16(self.fp.read(2))
  File "/usr/local/lib/python3.3/site-packages/PIL/_binary.py", line 53, in i16be
    return unpack(">H", c[o:o+2])[0]
struct.error: unpack requires a bytes object of length 2
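If I'm reading the traceback right, the failure is inside PIL rather than my own code: it dies with a CRC/struct.error while parsing what is presumably a truncated or corrupt PNG that newspaper downloads while resolving images for an article. If so, I could probably sidestep it by turning image fetching off via the same Config object (assumption on my part that fetch_images is the relevant flag):

from newspaper import Config

config = Config()
config.fetch_images = False  # assumption: skip downloading/probing article images
# ...then pass config=config into newspaper.build(), as in the snippet above

But I assume that would leave article.top_image empty, which I'd rather keep, so I'd still like to understand the real cause.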
Let me know if I've left anything out and I'll update the question. My code is as follows:
import newspaper
import pymysql
import time


def InitialiseDB():
    # Connect to the local MySQL DB; autocommit so each INSERT is persisted immediately
    db = pymysql.connect(host="localhost",   # your host, usually localhost
                         user="xxx",         # your username
                         passwd="yyyyyyyy",  # your password
                         db="testdb",        # name of the database
                         autocommit=True, use_unicode=True, charset="utf8")
    cur = db.cursor()
    return cur


def CloseDB(cur):
    cur.close()


def GetListOfMediaURLs(cur):
    cur.execute("SELECT sourceid, sourcename, sourceurl FROM MediaSources LIMIT 5")
    rows = cur.fetchall()
    return rows


def SaveArticleInDB(sourceid, cur, article):
    article.download()
    article.parse()
    # Skip pages where the site served its anti-ad-block notice instead of the article
    if "ad-block" in article.text:
        return
    article.nlp()
    print(article.keywords)
    print(article.summary)
    date = time.strftime("%c")
    # Flatten the list attributes into comma-separated strings for storage
    authors = ', '.join(article.authors)
    movies = ', '.join(article.movies)
    keywords = ', '.join(article.keywords)
    # The parameterized query escapes all values, so article.html is passed as-is
    cur.execute(
        """INSERT INTO FullArticlesImported (url, title, publishdate, text, html, authors, sourceid, topimage, movies, keywords, summary, importdate)
           VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
        (article.url, article.title, article.publish_date, article.text, article.html,
         authors, sourceid, article.top_image, movies, keywords, article.summary, date))


cur = InitialiseDB()
mediasources = GetListOfMediaURLs(cur)
for mediasource in mediasources:
    # Grab each article from this source and store it in the db
    print("Processing... " + mediasource[1])
    paper = newspaper.build(mediasource[2], memoize_articles=False)
    # Print the brand name and description for each source
    print(paper.brand + " " + paper.description)
    if paper.size() == 0:
        print("No of articles are 0!!!")
    for article in paper.articles:
        SaveArticleInDB(mediasource[0], cur, article)
CloseDB(cur)
quit()
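As a stopgap I'm also considering guarding the per-article work so one bad image can't abort the whole run; a sketch, assuming the struct.error actually propagates up out of SaveArticleInDB:

import struct

for article in paper.articles:
    try:
        SaveArticleInDB(mediasource[0], cur, article)
    except struct.error as e:
        # Presumably a corrupt image that PIL choked on; log it and move on
        print("Skipping article %s due to image error: %s" % (article.url, e))

Is that reasonable, or is there a cleaner way to make newspaper/PIL tolerate broken images?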