python - python 抓取新闻文章中的 HTTPError 错误

Question

我正在尝试从新闻网站上抓取与冠状病毒相关的文章。但是，我得到HTTPError错误。其他新闻门户网站也会出现同样的错误。该代码适用于不同的网站。我在这篇文章中用类似的代码问了一个不同的问题。以前对类似问题的一些答案要求更改，但与 URL 一起user-agent插入后仍然无法正常工作。headers = {'User-Agent': 'Mozilla/5.0'}这可能是因为我没有正确使用代码。任何帮助将非常感激。

这是我使用的代码：

import urllib.request
import newspaper
from newspaper import Article
import csv, os
from bs4 import BeautifulSoup
import urllib

req_keywords = ['coronavirus', 'covid-19']

newspaper_base_url = 'https://thehimalayantimes.com/'
category = 'nepal'

def checkif_kw_exist(list_one, list_two):
    common_kw = set(list_one) & set(list_two)
    if len(common_kw) == 0: return False, common_kw
    else: return True, common_kw

def get_article_info(url):
    a = Article(url)
    try:
        a.download()
        a.parse()
        a.nlp()
        success, checked_kws = checkif_kw_exist(req_keywords, a.text.split())
        if success:
            return [url, a.publish_date, a.title, a.text]
        else: return False
    except:
        return False

output_file = "J:/B/output_nepal.csv"
if not os.path.exists(output_file):
    open(output_file, 'w').close() 

for index in range(1,3700,1):
    page_url = newspaper_base_url + '/' + category + '?page='+str(index)

    page_soup = BeautifulSoup( urllib.request.urlopen(page_url).read())

    primary_tag = page_soup.find_all("h4", attrs={"class": "pad-bottom-small"})

    for tag in primary_tag:

        url = tag.find("a")
        url = newspaper_base_url + url.get('href')
        result = get_article_info(url)
        if result is not False:
            with open(output_file, 'a', encoding='utf-8') as f:
                writeFile = csv.writer(f)
                writeFile.writerow(result)
                f.close
        else: 
            pass

这是我得到的错误：

---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
<ipython-input-34-c9c043bb59fb> in <module>
     69     page_url = newspaper_base_url + '/' + category + '?page='+str(index)
     70 
---> 71     page_soup = BeautifulSoup( urllib.request.urlopen(page_url).read())
     72 
     73     primary_tag = page_soup.find_all("h4", attrs={"class": "pad-bottom-small"})

~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    221     else:
    222         opener = _opener
--> 223     return opener.open(url, data, timeout)
    224 
    225 def install_opener(opener):

~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
    530         for processor in self.process_response.get(protocol, []):
    531             meth = getattr(processor, meth_name)
--> 532             response = meth(req, response)
    533 
    534         return response

~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
    640         if not (200 <= code < 300):
    641             response = self.parent.error(
--> 642                 'http', request, response, code, msg, hdrs)
    643 
    644         return response

~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
    568         if http_err:
    569             args = (dict, 'default', 'http_error_default') + orig_args
--> 570             return self._call_chain(*args)
    571 
    572 # XXX probably also want an abstract factory that knows when it makes

~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    502         for handler in handlers:
    503             func = getattr(handler, meth_name)
--> 504             result = func(*args)
    505             if result is not None:
    506                 return result

~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
    648 class HTTPDefaultErrorHandler(BaseHandler):
    649     def http_error_default(self, req, fp, code, msg, hdrs):
--> 650         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    651 
    652 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 403: Forbidden

python - python 抓取新闻文章中的 HTTPError 错误

0 回答 0

Related

Reference