18

我制作了一个供个人使用的python脚本,但它不适用于维基百科......

这项工作:

import urllib2, sys
from bs4 import BeautifulSoup

site = "http://youtube.com"
page = urllib2.urlopen(site)
soup = BeautifulSoup(page)
print soup

这不起作用:

import urllib2, sys
from bs4 import BeautifulSoup

site= "http://en.wikipedia.org/wiki/StackOverflow"
page = urllib2.urlopen(site)
soup = BeautifulSoup(page)
print soup

这是错误:

Traceback (most recent call last):
  File "C:\Python27\wiki.py", line 5, in <module>
    page = urllib2.urlopen(site)
  File "C:\Python27\lib\urllib2.py", line 126, in urlopen
    return _opener.open(url, data, timeout)
  File "C:\Python27\lib\urllib2.py", line 406, in open
    response = meth(req, response)
  File "C:\Python27\lib\urllib2.py", line 519, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Python27\lib\urllib2.py", line 444, in error
    return self._call_chain(*args)
  File "C:\Python27\lib\urllib2.py", line 378, in _call_chain
    result = func(*args)
  File "C:\Python27\lib\urllib2.py", line 527, in http_error_default
    raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
HTTPError: HTTP Error 403: Forbidden
4

3 回答 3

63

在当前代码中:

蟒蛇2.X

import urllib2, sys
from BeautifulSoup import BeautifulSoup

site= "http://en.wikipedia.org/wiki/StackOverflow"
hdr = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(site,headers=hdr)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page)
print soup

Python 3.X

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

site= "http://en.wikipedia.org/wiki/StackOverflow"
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(site,headers=hdr)
page = urlopen(req)
soup = BeautifulSoup(page)
print(soup)

带有 Selenium 的 Python 3.X(Javascript 函数执行)

from selenium import webdriver as driver

browser = driver.PhantomJS()
p = browser.get("http://en.wikipedia.org/wiki/StackOverflow")
assert "Stack Overflow - Wikipedia" in browser.title

修改版本有效的原因是因为 Wikipedia 检查 User-Agent 是否属于“流行浏览器”

于 2012-10-24T18:29:32.020 回答
0
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
url = 'yourlink'
req = Request(url , headers={'User-Agent': 'Mozilla/5.0'})

webpage = urlopen(req).read()
page_soup = soup(webpage, "html.parser")

这对我有用,也应该对你有用!

于 2022-01-04T17:20:29.080 回答
0
from urllib.request import urlopen, Request
from urllib.parse import urlparse
from bs4 import BeautifulSoup as soup


def checkURL(requested_url):
    if not urlparse(requested_url).scheme:
        requested_url = "https://" + requested_url
    return requested_url


def requestAndParse(requested_url):
    requested_url = checkURL(requested_url)
    try:
        # define headers to be provided for request authentication
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) ' 
                        'AppleWebKit/537.11 (KHTML, like Gecko) '
                        'Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'none',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive'}
        request_obj = Request(url = requested_url, headers = headers)
        opened_url = urlopen(request_obj)
        page_html = opened_url.read()
        opened_url.close()
        page_soup = soup(page_html, "html.parser")
        return page_soup, requested_url

    except Exception as e:
        print(e)

# Exmaple:
page, url = requestAndParse(url)

试试上面的代码片段来beautifulsoup加载页面。

于 2022-02-07T12:19:50.603 回答