我需要使用
twisted.web.client.getPage()
或类似的方法,从已知地址(例如:www.google.com)下载网页。问题是:我位于代理服务器之后,却找不到任何说明,解释如何配置 Twisted 或其工厂(factory)来使用我的代理。有什么想法吗?
请注意,我必须指定用户名、密码、主机和端口。在我的 Linux 机器上,我把 http_proxy
和 https_proxy
都设置为 http://user:pwd@ip:port
先感谢您。
我需要使用
twisted.web.client.getPage()
或类似的方法,从已知地址(例如:www.google.com)下载网页。问题是:我位于代理服务器之后,却找不到任何说明,解释如何配置 Twisted 或其工厂(factory)来使用我的代理。有什么想法吗?
请注意,我必须指定用户名、密码、主机和端口。在我的 Linux 机器上,我把 http_proxy
和 https_proxy
都设置为 http://user:pwd@ip:port
先感谢您。
from twisted.internet import reactor
from twisted.web import client
def processResult(page):
    """Print the fetched page and schedule the reactor to stop.

    Registered as the success callback of the factory's deferred.

    :param page: the value the deferred fired with; only its ``repr`` is
        printed.
    """
    # One parenthesised string prints identically under the Python 2 print
    # statement and the Python 3 print function; the original statement form
    # is a SyntaxError on Python 3.
    print("I got some data " + repr(page))
    # Stop the reactor a moment later rather than from inside the callback.
    reactor.callLater(0.1, reactor.stop)
def dealWithError(err):
    """Print the failure's error message and schedule the reactor to stop.

    Registered as the errback of the factory's deferred.

    :param err: the failure object passed to the errback; its
        ``getErrorMessage()`` result is printed.
    """
    # Parenthesised single argument: same output on Python 2 and Python 3,
    # whereas the original print statement only parses on Python 2.
    print(err.getErrorMessage())
    # Stop the reactor a moment later rather than from inside the errback.
    reactor.callLater(0.1, reactor.stop)
class ProxyClientFactory(client.HTTPClientFactory):
    """HTTP client factory whose request path is the complete target URL."""

    def setURL(self, url):
        """Let the base class record *url*, then use the whole URL as path.

        :param url: the URL of the page to fetch.
        """
        base = client.HTTPClientFactory
        base.setURL(self, url)
        # Replace the path stored by the base class with the full URL; the
        # factory is connected to a proxy rather than to the target host.
        self.path = url
# Build the factory for the page to fetch ('http://url_you_want' is a
# placeholder for the real target URL).
factory = ProxyClientFactory('http://url_you_want')
# processResult receives the fetched page, dealWithError receives a failure.
factory.deferred.addCallbacks(processResult, dealWithError)
# Connect to the proxy (placeholder host name, port 3142), not to the target.
reactor.connectTCP('proxy_address', 3142, factory)
# Run the event loop; both handlers above schedule reactor.stop.
reactor.run()
要让 nosklo 的解决方案正常工作,您还需要再添加一个处理程序,用来处理表示“需要身份验证”的 401 响应。可以试试下面的写法:
def checkAuthError(self, failure, url):
    """Retry *url* with HTTP Basic credentials after a 401 response.

    Meant to be added as an errback. The user name and password are read
    interactively from the command line.

    :param failure: the failure handed to the errback.
    :param url: the URL to request again with an ``Authorization`` header.
    :return: the deferred returned by ``client.getPage`` for the retry, or
        the original failure when the status is not ``'401'``.
    """
    # Only web errors are handled here; trap() passes anything else on.
    failure.trap(error.Error)
    if failure.value.status != '401':
        return failure
    username = raw_input("User name: ")
    password = getpass.getpass("Password: ")
    # b64encode never inserts line breaks. encodestring wraps its output
    # every 76 characters, which would put a newline in the middle of the
    # header value for long credentials (strip() only trims the ends).
    auth = base64.b64encode("%s:%s" % (username, password))
    header = "Basic " + auth
    return client.getPage(url, headers={"Authorization": header})
这段代码会在命令行中提示操作员输入信息;您也可以改用其他方式提供用户名和密码。请确保它是第一个作为 errback 添加的处理程序,之后再添加其他任何处理程序(包括回调)。此外还需要增加几个导入:'base64'、'getpass' 和 'error',以配合命令行提示的实现。
我也不得不用基本身份验证(basic auth)实现类似的功能。由于这里给出的身份验证请求示例代码无法工作,下面是一个可用的版本:
import base64
from twisted.internet import defer, reactor
from twisted.web import client, error, http
from ubuntuone.devtools.testcases.squid import SquidTestCase
# ignore common twisted lint errors
# pylint: disable=C0103, W0212
class ProxyClientFactory(client.HTTPClientFactory):
    """Client factory that sends its request through a proxy."""

    def __init__(self, proxy_url, proxy_port, url, headers=None):
        """Remember the proxy endpoint, then initialise the base factory.

        :param proxy_url: proxy host; later copied to ``self.host``.
        :param proxy_port: proxy port; later copied to ``self.port``.
        :param url: the URL of the page to fetch.
        :param headers: optional request headers, passed to the base class.
        """
        # Stored before the base initialiser runs -- presumably it calls
        # setURL(), which reads both attributes (TODO confirm).
        self.proxy_url, self.proxy_port = proxy_url, proxy_port
        client.HTTPClientFactory.__init__(self, url, headers=headers)

    def setURL(self, url):
        """Target the proxy endpoint while requesting the complete *url*."""
        self.host, self.port = self.proxy_url, self.proxy_port
        # Both the recorded URL and the request path are the full URL.
        self.url = self.path = url
class ProxyWebClient(object):
    """Provide useful web methods with proxy."""

    def __init__(self, proxy_url=None, proxy_port=None, username=None,
                 password=None):
        """Create a new instance with the proxy settings.

        :param proxy_url: host of the proxy every request is connected to.
        :param proxy_port: port of the proxy.
        :param username: user name sent when the proxy asks for credentials.
        :param password: password sent when the proxy asks for credentials.
        """
        self.proxy_url = proxy_url
        self.proxy_port = proxy_port
        self.username = username
        self.password = password

    def _connect(self, url, factory, contextFactory=None):
        """Connect *factory* to the proxy using the transport *url* calls for.

        An ``https`` URL is connected with ``connectSSL`` (creating a default
        ``ssl.ClientContextFactory`` when none is given); anything else is
        connected with ``connectTCP``.
        """
        scheme, _, _, _ = client._parse(url)
        # pylint: disable=E1101
        if scheme == 'https':
            from twisted.internet import ssl
            if contextFactory is None:
                contextFactory = ssl.ClientContextFactory()
            reactor.connectSSL(self.proxy_url, self.proxy_port,
                               factory, contextFactory)
        else:
            reactor.connectTCP(self.proxy_url, self.proxy_port, factory)
        # pylint: enable=E1101

    def _process_auth_error(self, failure, url, contextFactory):
        """Retry *url* with basic proxy credentials after an auth failure.

        :param failure: the failure handed to the errback.
        :param url: the URL that was being fetched.
        :param contextFactory: SSL context factory to reuse for the retry,
            or None.
        :return: the deferred of the retry, or the original failure when the
            status is not ``http.PROXY_AUTH_REQUIRED``.
        """
        # Only web errors are handled here; trap() passes anything else on.
        failure.trap(error.Error)
        if failure.value.status != str(http.PROXY_AUTH_REQUIRED):
            return failure
        # we try to get the page using the basic auth
        auth = base64.b64encode('%s:%s' % (self.username, self.password))
        auth_header = 'Basic ' + auth
        factory = ProxyClientFactory(self.proxy_url, self.proxy_port, url,
                                     headers={'Proxy-Authorization':
                                              auth_header})
        # Retry over the same kind of connection as the first attempt; the
        # retry used to be plain TCP even when the first attempt had used
        # SSL against the very same proxy port.
        self._connect(url, factory, contextFactory)
        return factory.deferred

    def get_page(self, url, contextFactory=None, *args, **kwargs):
        """Download a webpage as a string.

        The request is sent through the configured proxy with a
        ProxyClientFactory. If the proxy answers with an auth error, a second
        try is performed so that the username and password are used.

        :param url: the URL of the page to fetch.
        :param contextFactory: optional SSL context factory for https URLs.
        :param args: accepted for call compatibility; currently ignored.
        :param kwargs: accepted for call compatibility; currently ignored.
        :return: the factory's deferred, with the auth-retry errback added.
        """
        factory = ProxyClientFactory(self.proxy_url, self.proxy_port, url)
        self._connect(url, factory, contextFactory)
        factory.deferred.addErrback(self._process_auth_error, url,
                                    contextFactory)
        return factory.deferred
我们选择使用 http_proxy
环境变量。我们曾遇到重定向并不总是被处理的问题,更确切地说,是没有以正确的方式被处理。话虽如此,nosklo
的回答确实很有帮助!
import os
from twisted.web import client
class ProxyClientFactory(client.HTTPClientFactory):
    """Client factory that honours the ``<scheme>_proxy`` environment variable."""

    def setURL(self, url):
        """Set the target, re-checking the proxy settings for *url*.

        More sensitive to redirects that can happen, that may or may not be
        proxied / have different proxy settings: the environment variable
        matching the scheme of *url* is looked up on every call.
        """
        target_scheme, _, _, _ = client._parse(url)
        proxy = os.environ.get('%s_proxy' % target_scheme)
        if not proxy:
            # No proxy configured for this scheme: plain base-class behaviour.
            client.HTTPClientFactory.setURL(self, url)
            return
        # Connection details come from the proxy URL; the request itself
        # keeps the complete target URL as both path and url.
        proxy_scheme, proxy_host, proxy_port, _ = client._parse(proxy)
        self.scheme, self.host, self.port = (
            proxy_scheme, proxy_host, proxy_port)
        self.path = self.url = url
factory = ProxyClientFactory(url)
# Callback configuration
# When http_proxy, https_proxy, or whichever proxy variable matches the
# scheme is set, honour it by connecting to the proxy's host/port instead of
# the target's. The original note says the client factory,
# BaseRequestServicer, takes care of the rest.
scheme, host, port, path = client._parse(url)
proxy = os.environ.get('%s_proxy' % scheme)
if proxy:
    # Override the connection endpoint with the one parsed from the proxy.
    scheme, host, port, path = client._parse(proxy)
# Connect to the target directly, or to the proxy when one was configured.
reactor.connectTCP(host, port, factory)