6

我正在编写一个 python 程序,它使用 PyQT4 加载一些 URL 并处理它的内容/DOM(在它被 javascript 修改之后)。我还需要自定义标题来请求该页面。

下面的代码可以正常工作,只是它无法使用我用 QNetworkRequest 定义的自定义标头获取 URL。

    import sys
    import signal
    from optparse import OptionParser
    from PyQt4.QtCore import *
    from PyQt4.QtGui import *
    from PyQt4.QtWebKit import QWebPage
    from PyQt4.QtNetwork import QNetworkAccessManager, QNetworkRequest, QNetworkReply

    class MyNetworkAccessManager(QNetworkAccessManager):
        def __init__(self, url):
            QNetworkAccessManager.__init__(self)
            self.request = QNetworkRequest(QUrl(url))
            self.request.setRawHeader('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US)')
            self.request.setRawHeader("Accept-Language","en-us,en;q=0.5");
            self.request.setRawHeader("Accept-Charset","ISO-8859-1,utf-8;q=0.7,*;q=0.7");
            self.request.setRawHeader("Connection","keep-alive");      
            self.reply = self.get(self.request)

        def createRequest(self, operation, request, data):
            print "mymanager handles ", request.url()
            return QNetworkAccessManager.createRequest( self, operation, request, data )

    class Crawler( QWebPage ):
        def __init__(self, url, file):
            QWebPage.__init__( self )
            self._url = url
            self._file = file
            manager = MyNetworkAccessManager(url)
            self.setNetworkAccessManager(manager)

        def userAgentForUrl(self, url):
            return "Mozilla/122.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"

        def crawl( self ):
            signal.signal( signal.SIGINT, signal.SIG_DFL )
            self.connect( self, SIGNAL( 'loadFinished(bool)' ), self._finished_loading )
            self.mainFrame().load( QUrl( self._url ) )

        def _finished_loading( self, result ):
            file = open( self._file, 'w' )
            file.write( self.mainFrame().toHtml() )
            file.close()
            sys.exit( 0 )

    def main():
        app = QApplication( sys.argv )
        options = get_cmd_options()
        crawler = Crawler( options.url, options.file )
        crawler.crawl()
        sys.exit( app.exec_() )

    def get_cmd_options():
        """
            gets and validates the input from the command line
        """
        usage = "usage: %prog [options] args"
        parser = OptionParser(usage)
        parser.add_option('-u', '--url', dest = 'url', help = 'URL to fetch data from')
        parser.add_option('-f', '--file', dest = 'file', help = 'Local file path to save data to')

        (options,args) = parser.parse_args()

        if not options.url:
            print 'You must specify an URL.',sys.argv[0],'--help for more details'
            exit(1)
        if not options.file:
            print 'You must specify a destination file.',sys.argv[0],'--help for more details'
            exit(1)

        return options

    if __name__ == '__main__':
        main()

谁能告诉我为什么它不选择标题设置?

4

1 回答 1

5

移动setRawHeader内部createRequest功能,它将起作用。您可以在此处发送测试请求。

def __init__(self, url):
    QNetworkAccessManager.__init__(self)
    request = QNetworkRequest(QUrl(url))
    self.reply = self.get(request)

def createRequest(self, operation, request, data):
    print("mymanager handles ", request.url())
    request.setRawHeader('User-Agent', 'Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101')
    request.setRawHeader("Accept-Language","en-us,en;q=0.5");
    request.setRawHeader("Accept-Charset","ISO-8859-1,utf-8;q=0.7,*;q=0.7");
    request.setRawHeader("Connection","keep-alive");
    return QNetworkAccessManager.createRequest( self, operation, request, data )

注意我也User-agent改为User-Agent

于 2013-10-08T23:50:00.647 回答