
I used this script:

from twisted.internet import reactor, threads
from urlparse import urlparse
import httplib
import itertools


concurrent = 200
finished=itertools.count(1)
reactor.suggestThreadPoolSize(concurrent)

def getStatus(ourl):
    url = urlparse(ourl)
    conn = httplib.HTTPConnection(url.netloc)   
    conn.request("HEAD", url.path)
    res = conn.getresponse()
    return res.status

def processResponse(response,url):
    print response, url
    processedOne()

def processError(error,url):
    print "error", url#, error
    processedOne()

def processedOne():
    if finished.next()==added:
        reactor.stop()

def addTask(url):
    req = threads.deferToThread(getStatus, url)
    req.addCallback(processResponse, url)
    req.addErrback(processError, url)   

added=0
for url in open('urllist.txt'):
    added+=1
    addTask(url.strip())

try:
    reactor.run()
except KeyboardInterrupt:
    reactor.stop()

When I try to run the script with $ python test.py

it just prints the urls instead of cURL-ing them or sending HTTP requests..

How do I send an HTTP request (or a cURL process) for each one?

Thanks


2 Answers


Tested code, using inlineCallbacks and deferToThread. It also uses defer.gatherResults to know when all of the deferreds have been handled (instead of the counter approach in the OP):

from twisted.internet import reactor, defer
from twisted.internet.threads import deferToThread
from urlparse import urlparse
import httplib

# Using __get__ turns deferToThread into a decorator: calling the decorated
# function runs it in the reactor's thread pool and returns a Deferred.
threadDeferred = deferToThread.__get__

@threadDeferred
def get_url_head(url_arg):
  url = urlparse(url_arg)
  conn = httplib.HTTPConnection(url.netloc)   
  conn.request("HEAD", url.path)
  res = conn.getresponse()
  conn.close()
  return res.status

@defer.inlineCallbacks
def check_url(sem,url_arg):
  yield sem.acquire()
  try:
    result = yield get_url_head(url_arg)
    defer.returnValue(result)
  finally:
    sem.release()

@defer.inlineCallbacks
def run(reactor,SEMAPHORE_SIZE=10):
  sem = defer.DeferredSemaphore(SEMAPHORE_SIZE)
  deferreds = []
  failed_urls = []
  responded_urls = []
  with open('urllist.txt','r') as f:
    for line in f:
      url_arg = line.strip()
      d = check_url(sem,url_arg)
      d.addCallback(processResult, url_arg, responded_urls)
      d.addErrback(processErr, url_arg, failed_urls)
      deferreds.append(d)
  res = yield defer.gatherResults(deferreds)
  # Do something else with failed_urls and responded_urls
  reactor.callLater(0,reactor.stop)

def main():
  from twisted.internet import reactor
  reactor.callWhenRunning(run,reactor)
  reactor.run()

def processResult(result,url_arg,responded_urls):
  print "Reponse %s from %s" % (result,url_arg)
  responded_urls.append((url_arg,result))

def processErr(err,url_arg,failed_urls):
  print "Error checking %s: %s" % (url_arg,repr(err.value))
  failed_urls.append((url_arg,err.value))

if __name__ == '__main__':
  main()
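
A note on the design: the DeferredSemaphore caps the number of in-flight deferToThread calls at SEMAPHORE_SIZE, so the reactor's thread pool is never flooded. Twisted's DeferredSemaphore also has a run() helper that acquires the semaphore, calls the function, and releases when the resulting Deferred fires, so check_url could be reduced to this sketch (same behavior, assuming the definitions above):

def check_url(sem, url_arg):
  # sem.run acquires the semaphore, runs get_url_head(url_arg), and
  # releases the semaphore once the returned Deferred fires.
  return sem.run(get_url_head, url_arg)
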
Answered 2013-08-27T20:10:42.007

This should work if your urls do not include 'http://'. If they do include 'http://', the commented-out line inside the code handles that case:

import httplib

def requester(url):
    host = url.split('/')[0]
    # if urls DO contain 'http://' -->  host = url.split('/')[2]
    path = url[url.find(host)+len(host):] or '/'   # default to '/' when there is no path
    conn = httplib.HTTPConnection(host)
    conn.request("HEAD", path)
    response = conn.getresponse()
    print response.status, response.reason

    # if you want the body too...
    # data = response.read()
    # print data

for url in open('urls.txt'):
    try:
        requester(url.strip())
    except Exception, e:
        print "error:", e
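
Incidentally, splitting the string by hand sidesteps a urlparse gotcha that likely affected the original script: without a scheme, urlparse leaves netloc empty, so HTTPConnection would get an empty host. A quick demonstration:

from urlparse import urlparse

# Without a scheme, the host ends up in .path and .netloc is empty:
print urlparse('example.com/index.html').netloc         # ''
print urlparse('http://example.com/index.html').netloc  # 'example.com'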

Also, I suggest taking a look at the httplib documentation.
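
For reference, a minimal standalone httplib HEAD request looks like this (www.example.com is only a placeholder host):

import httplib

conn = httplib.HTTPConnection("www.example.com")
conn.request("HEAD", "/")
resp = conn.getresponse()
print resp.status, resp.reason
print resp.getheaders()   # list of (header, value) pairs
conn.close()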

Answered 2013-08-27T20:41:56.417