我正在使用 cURL 通过代理访问一个 XML 页面，但由于某种原因始终无法连接到该页面。解析部分没有问题，因此下面的代码中省略了完整的解析逻辑。
from bs4 import BeautifulSoup
import time #added for curl
import subprocess #added for curl
import os #added for curl
# Script: for every page ID listed in list.txt, download the matching XML
# page through the NTLM proxy with curl, then write the parsed application
# status into a new "<name>.txt" report.
file_name = raw_input("Type the name of the new file you will create: ")

# Raw string so the Windows backslashes can never be interpreted as escapes.
curlURL = r'F:\Downloads\curl-7.31.0-rtmp-ssh2-ssl-sspi-zlib-idn-static-bin-w32\curl.exe'

# Both files are managed by `with`, so the report is flushed and closed even
# if a download or the parsing raises part-way through.
with open(file_name + ".txt", 'w') as g, open("list.txt") as f:  # list.txt: one page ID per line
    g.write("---XML Parse---\n")
    for line in f:
        # strip() also drops a trailing '\r' or stray spaces that would
        # otherwise end up inside the URL; blank lines are skipped.
        page_id = line.strip()
        if not page_id:
            continue

        g.write("\nPage ID: " + page_id + "\n")

        # The page ID must be concatenated OUTSIDE the string literal;
        # previously the literal text "+line.rstrip('\n')" (including a real
        # newline character) was sent as part of the URL.
        link = "https://somewebsite.com/" + page_id

        # Argument list instead of one concatenated string: no quoting
        # problems if the curl path or the URL contains spaces.
        # NOTE(review): curl error 56 with HTTP 407 means the proxy rejected
        # or did not receive credentials. "-U :" asks an SSPI-enabled curl
        # to use the logged-in Windows account -- confirm the proxy accepts
        # NTLM for that account, or pass "-U user:password" explicitly.
        args = [
            curlURL,
            '-L', link,
            '-o', 'c:\\temp.txt',
            '--proxy-ntlm',
            '-x', 'http://myproxy:80',  # using a proxy
            '-k',
            '-U', ':',
        ]
        print(' '.join(args))

        # Run curl and wait for it to finish before proceeding.
        exit_code = subprocess.call(args)
        if exit_code != 0:
            # curl failed (e.g. the 407 above): there is nothing valid to
            # parse, so report it and move on instead of crashing on a
            # missing or partial temp file.
            print("curl exited with code %d for %s" % (exit_code, link))
            if os.path.exists('C:/temp.txt'):
                os.remove('C:/temp.txt')
            continue

        # Read in the temporary file curl just wrote (same file as the
        # 'c:\\temp.txt' passed to -o), closing the handle before deletion.
        with open('C:/temp.txt', 'r') as temp_file:
            xml_string = temp_file.read()
        time.sleep(3)  # pause kept from the original between read and delete
        os.remove('C:/temp.txt')  # clean up

        soup = BeautifulSoup(xml_string)
        result = soup.find('bibliographic-data')
        if result is not None:
            status = result['status']
            g.write("\nApplication Status: " + status + "\n")
            g.write("Most Recent Event Information: \n")
            #...i go on to parse the document
我收到错误消息:
curl:(56) Received HTTP code 407 from proxy after CONNECT
有人知道为什么代理会拒绝我的访问吗？