python - 使用python解析XML文件

Question

我有一个别人替我编写好的 Python 模块，用于下载并解析谷歌专利列表中的数据。这段代码在处理 2005 年及以后的数据时运行良好，但一旦用于 2005 年之前的数据就会出错。除了知道如何运行该模块之外，我对 Python 一无所知。我该如何解决这个问题？

我收到的回溯是：

Traceback (most recent call last): 
  File "C:\Users\John\Desktop\FINAL BART ALL INFO-Magic Bullet.py", line 46, in <module> 
    assert xml_file is not None
AssertionError

这是我正在使用的代码：

#Ignore all this information 
import urllib2, os, zipfile
from lxml import etree
#-------------------------------------------------------------------------------
#Ignore all this information 
def xmlSplitter(data,separator=lambda x: x.startswith('<?xml')):
  """Split a stream of concatenated XML documents into single documents.

  data is any iterable of lines (for example an open file). A new document
  starts at every line for which separator(line) is true; by default that is
  a line starting with '<?xml'. Each document is yielded as one string made
  of its lines joined unchanged. Lines seen before the first separator line
  are yielded as a chunk of their own.

  Nothing is yielded for empty input, so the caller never receives an empty
  string to hand to the XML parser.
  """
  buff = []
  for line in data:
    # A separator line closes the document collected so far (if any).
    if separator(line) and buff:
      yield ''.join(buff)
      buff = []
    buff.append(line)
  # Flush the last document; skipped when data produced no lines at all.
  if buff:
    yield ''.join(buff)

def first(seq,default=None):
  """Give back the leading element of seq, or default when seq is empty."""
  # next() with a fallback covers both the "has an item" and "empty" cases.
  return next(iter(seq), default)
#-------------------------------------------------------------------------------
#Set the internet source file here - use the file extensions from the sheet provided.
datasrc = "http://storage.googleapis.com/patents/grant_full_text/2003/pg030107.zip"
#Example of another archive URL: http://commondatastorage.googleapis.com/patents/grant_full_text/2012/ipg120117.zip
#The local file name is whatever follows the last "/" of the URL.
filename = datasrc.rsplit('/', 1)[-1]
#-------------------------------------------------------------------------------
#Ignore all this information 
#Download the archive only when no local copy exists yet.
if not os.path.exists(filename):
  # Read the whole response BEFORE creating the local file: if the download
  # fails, no empty file is left behind that would make the next run skip
  # the download and then fail to open it as a zip archive.
  r = urllib2.urlopen(datasrc)
  try:
    payload = r.read()
  finally:
    # Always release the HTTP connection, even when read() raises.
    r.close()
  with open(filename,'wb') as file_write:
    file_write.write(payload)

zf = zipfile.ZipFile(filename)
#Name of the first archive member ending in .xml, or None when there is none.
xml_file = first([ x for x in zf.namelist() if x.endswith('.xml')])
if xml_file is None:
  # Raised explicitly (same exception type as the former bare assert) so the
  # check still runs under "python -O" and the traceback says what is wrong.
  raise AssertionError(
    "No .xml file found in %s (archive members: %r). The archive downloaded "
    "from %s may be empty or may not contain XML data."
    % (filename, zf.namelist(), datasrc))
#-------------------------------------------------------------------------------
#Folder that receives the output files - keep the doubled \\ separators, including the trailing one.
outFolder = "C:\\PatentFiles\\"
#Name of the downloaded archive without its extension; used as the prefix of every output file.
outFilename = os.path.splitext(filename)[0]
#-------------------------------------------------------------------------------
#Full paths of the four text files written for this archive.
output = "".join([outFolder, outFilename, "_general.txt"])
output2 = "".join([outFolder, outFilename, "_USCL.txt"])
output3 = "".join([outFolder, outFilename, "_citation.txt"])
output4 = "".join([outFolder, outFilename, "_inventor.txt"])
#Open all four files for writing, in the order general, USCL, citation, inventor.
outFile, outFile2, outFile3, outFile4 = [open(path, "w") for path in (output, output2, output3, output4)]
#Each file starts with one pipe-delimited header row.
for handle, header in (
    (outFile, "Patent No.|GrantDate|Application Date|Number of Claims|Examiners|US Primary Main Classification|Assignee|Assignee Address City_State_Country|First Inventor|First Inventor Address City_State_Country| \n"),
    (outFile2, "Patent No.|Primary|U.S Classification| \n"),
    (outFile3, "Patent No.|Citation|Citation Date|Who Cited This| \n"),
    (outFile4, "Patent No.|Inventor Last Name|First Name|City|State|Country|Nationality Country|Residence Country|\n"),
):
  handle.write(header)
#-------------------------------------------------------------------------------
#Main loop: parse each XML document of the archive and append one record per
#document to each of the four output files.
#Here is the count- adjust this each time you run the program for the first time.
#Run at 10 for the 1st run then 5500 afterward.
count = 0
for item in xmlSplitter(zf.open(xml_file)):
  count += 1
  #Stop after this many documents; replace 10 with 5500 for a full run.
  #5500
  if count > 10: break  
  doc = etree.XML(item)
  #-------------------------------------------------------------------------------
  #This is where the python starts parsing the information.
  #This is the Start of the General Information file.
  #docID: country and doc-number text joined with "~", then zero padding after
  #the D/H/PP/RE prefixes and after "~" is stripped and the "US~" prefix removed.
  docID = "~".join(doc.xpath('//publication-reference/document-id/country/text()|//publication-reference/document-id/doc-number/text()'))
  docID = docID.replace("D0","D") 
  docID = docID.replace("H000","H")
  docID = docID.replace("PP0","PP")
  #NOTE(review): the next line repeats the previous replacement; it strips a
  #second padding zero only when the first pass left "PP0" behind - confirm intent.
  docID = docID.replace("PP0","PP")
  docID = docID.replace("RE0","RE")
  docID = docID.replace("~0","~")
  docID = docID.replace("US~","")

  #Each of these is the first matching text node, or None when nothing matches.
  grantdate = first(doc.xpath('//publication-reference/document-id/date/text()'))
  applicationdate = first(doc.xpath('//application-reference/document-id/date/text()'))
  claimsNum = first(doc.xpath('//number-of-claims/text()'))

  #Assignee names are joined with "-" and the "-" is then turned into ", ".
  #NOTE(review): this also rewrites hyphens that are part of a name itself.
  assignee1 = "-".join(doc.xpath('//assignees/assignee/addressbook/orgname/text()|//assignees/assignee/addressbook/last-name/text()|//assignees/assignee/addressbook/first-name/text()'))
  assignee1 = assignee1.replace('-',', ')
  #Assignee address parts joined with "_".
  assignee2 = "_".join(doc.xpath('//assignee/addressbook/address/*/text()'))
  assignees = str(assignee1.encode("UTF-8")) + "|" + str(assignee2.encode("UTF-8"))  

  #First inventor only: name and address parts; a missing part becomes the text "None".
  inventors1 = first(doc.xpath('//applicants/applicant/addressbook/last-name/text()'))
  inventor2 = first(doc.xpath('//applicants/applicant/addressbook/first-name/text()'))
  inventor3 = first(doc.xpath('//applicants/applicant/addressbook/address/city/text()'))
  inventor4 = first(doc.xpath('//applicants/applicant/addressbook/address/state/text()'))
  inventor5 = first(doc.xpath('//applicants/applicant/addressbook/address/country/text()'))
  #"First Last" followed by "City_State_Country"; name and city are UTF-8 encoded when present.
  inventor = str(inventor2.encode("UTF-8") if inventor2 else inventor2) + " " + str(inventors1.encode("UTF-8") if inventors1 else inventors1)
  inventors2 = str(inventor3.encode("UTF-8") if inventor3 else inventor3) + "_" + str(inventor4) + "_" + str(inventor5)
  inventors = str(inventor) + "|" + str(inventors2)

  #Primary examiner name parts joined as "part, part".
  examiners = "~".join(doc.xpath('//examiners/primary-examiner/first-name/text()|//examiners/primary-examiner/last-name/text()'))
  examiners = examiners.replace("~",", ")

  #First main-classification text node (None when absent).
  uscl1 = first(doc.xpath('//classification-national/main-classification/text()'))

  #END FIRST TEXT FILE #-------------------------------------------------------------------------------
  #This begins the USCL file
  #The publication country text is turned into a flag: "US" becomes "0" here
  #(non-primary rows) and "1" below (primary row).
  #NOTE(review): first() returns None when the document has no country node,
  #in which case .replace raises AttributeError - confirm every document has one.
  notprimary = first(doc.xpath('//publication-reference/document-id/country/text()'))
  notprimary = notprimary.replace("US","0")

  primary1 = first(doc.xpath('//publication-reference/document-id/country/text()'))
  primary1 = primary1.replace("US","1")

  uscl2 = "~".join(doc.xpath('//us-bibliographic-data-grant/classification-national/*/text()|//sequence-cwu/publication-reference/document-id/country/text()'))
  #-------------------------NOTE--------------------------------------------------
  #--------------------------NOTE-------------------------------------------------
  #-----------------------NOTE----------------------------------------------------
  #NOTE- RUN through count 10 then remove pound signs from two below
  #The leading "US~" becomes the primary flag; every further "~" starts a new
  #output row prefixed with docID and the non-primary flag.
  uscl2 = uscl2.replace("US~", str(primary1) + "|")
  uscl2 = uscl2.replace("~", "|" + "\n" + str(docID) + "|" + str(notprimary) + "|")
  uscl2 = uscl2.replace("US", "|") 

  #END SECOND TEXT FILE #-------------------------------------------------------------------------------
  #Begin the Citation file
  #All citation fields (country, doc-number, kind, date, category) are joined
  #with "~" into one string; the replacements below rely on their exact order.
  citation = '~'.join(doc.xpath('//publication-reference/document-id/country/text()|//references-cited/citation/patcit/document-id/country/text()|//references-cited/citation/patcit/document-id/doc-number/text()|//references-cited/citation/patcit/document-id/kind/text()|//references-cited/citation/patcit/document-id/date/text()|//references-cited/citation/category/text()'))

  #Here is the start of the patent connectors- in the patents they exist at the end. They are replaced in this code to make pipes | for the final output
  #Kind codes are first swapped for the placeholder "$@", which becomes "|" further down.
  citation = citation.replace("~A~", "$@")
  citation = citation.replace("~S~", "$@")
  citation = citation.replace("~S1~", "$@")
  citation = citation.replace("~B1~", "$@")
  citation = citation.replace("~B2~", "$@")
  citation = citation.replace("~A1~", "$@")
  citation = citation.replace("~H~", "$@")
  citation = citation.replace("~E~", "$@")


  #Template for adding another kind code:
  #citation = citation.replace("~QQ~", "$@")

  #make unique citation changes here-for example when "US" or "DE" is embedded in citation see below
  #"$" and "D!" are temporary stand-ins; they are restored to "S" and "DE" at the end of this section.
  citation = citation.replace("05225US~", "05225U$|" )
  citation = citation.replace("063106 DE", "063106D!" )
  citation = citation.replace("US~US~", "US~" )
  citation = citation.replace("PCT/US", "PCT/U$")
  citation = citation.replace("PCTUS", "PCTU$")
  citation = citation.replace("WO US", "WO U$")
  citation = citation.replace("WO~US", "WO~ U$")

  #fixes for cites without pipes-see below -DONT TOUCH THESE
  citation = citation.replace("US~cited by examiner", "||cited by examiner" )
  citation = citation.replace("US~cited by other", "||cited by other" )


  #Here are the changes to return each citation into a unique row
  #If a country is only listed in the columns in Excel they need a fix like this, If KR is alone then use the code:::: citation = citation.replace("KR~", "Foreign -KR-" )
  #"~CC~" in the middle of the string starts a new row prefixed with docID;
  #a remaining "CC~" is rewritten in place as "Foreign -CC-".
  citation = citation.replace("$@", "|")
  citation = citation.replace("~US~", "|" + "\n" + str(docID) +"|")
  citation = citation.replace("US~", "")
  citation = citation.replace("~JP~", "|" + "\n" + str(docID) +"|"+ "Foreign -JP-")
  citation = citation.replace("JP~", "Foreign -JP-" )
  citation = citation.replace("~GB~", "|" + "\n" + str(docID) +"|"+ "Foreign -GB-")
  citation = citation.replace("GB~", "Foreign -GB-" )
  citation = citation.replace("~WO~", "|" + "\n" + str(docID) +"|"+ "Foreign -WO-")
  citation = citation.replace("WO~", "Foreign -WO-" )
  citation = citation.replace("~CA~", "|" + "\n" + str(docID) +"|"+ "Foreign -CA-")
  #The DE/EP special case runs before the generic "~DE~" replacement.
  citation = citation.replace("~DE~EP~", "~DE~ EP-" )
  citation = citation.replace("~DE~", "|" + "\n" + str(docID) +"|"+ "Foreign -DE-")
  citation = citation.replace("DE~", "Foreign -DE-" )
  citation = citation.replace("~KR~", "|" + "\n" + str(docID) +"|"+ "Foreign -KR-")
  citation = citation.replace("KR~", "Foreign -KR-" )
  citation = citation.replace("~EM~", "|" + "\n" + str(docID) +"|"+ "Foreign -EM-")
  citation = citation.replace("~CH~", "|" + "\n" + str(docID) +"|"+ "Foreign -CH-")
  #NOTE(review): same "~DE~" replacement as a few lines above - looks redundant.
  citation = citation.replace("~DE~", "|" + "\n" + str(docID) +"|"+ "Foreign -DE-")
  citation = citation.replace("~SE~", "|" + "\n" + str(docID) +"|"+ "Foreign -SE-")
  citation = citation.replace("~FR~", "|" + "\n" + str(docID) +"|"+ "Foreign -FR-")
  #NOTE(review): unlike the DE/EP case this special case comes AFTER "~FR~" was
  #already replaced, so it appears it can no longer match - confirm the intended order.
  citation = citation.replace("~FR~EP~", "~FR~ EP-" )
  citation = citation.replace("FR~", "Foreign -FR-" )
  citation = citation.replace("~CN~", "|" + "\n" + str(docID) +"|"+ "Foreign -CN-")
  citation = citation.replace("~TW~", "|" + "\n" + str(docID) +"|"+ "Foreign -TW-")
  citation = citation.replace("~TW", "|" + "\n" + str(docID) +"|"+ "Foreign -TW-")
  citation = citation.replace("TW~", "Foreign -TW-" )
  citation = citation.replace("~NL~", "|" + "\n" + str(docID) +"|"+ "Foreign -NL-")
  citation = citation.replace("~BR~", "|" + "\n" + str(docID) +"|"+ "Foreign -BR-")
  citation = citation.replace("~AU~", "|" + "\n" + str(docID) +"|"+ "Foreign -AU-")
  citation = citation.replace("~ES~", "|" + "\n" + str(docID) +"|"+ "Foreign -ES-")
  citation = citation.replace("~IT~", "|" + "\n" + str(docID) +"|"+ "Foreign -IT-")
  citation = citation.replace("~SU~", "|" + "\n" + str(docID) +"|"+ "Foreign -SU-")
  citation = citation.replace("~AT~", "|" + "\n" + str(docID) +"|"+ "Foreign -AT-")
  citation = citation.replace("~BE~", "|" + "\n" + str(docID) +"|"+ "Foreign -BE-")
  citation = citation.replace("~DK~", "|" + "\n" + str(docID) +"|"+ "Foreign -DK-")
  citation = citation.replace("~RU~", "|" + "\n" + str(docID) +"|"+ "Foreign -RU-")
  citation = citation.replace("RU~", "Foreign -RU-" )


  #Template for adding another country code:
  #citation = citation.replace("~QQ~", "|" + "\n" + str(docID) +"|"+ "Foreign -QQ-")

  #These are just end of citation fixes-DONT TOUCH THESE
  #Runs of "cited by ..." categories joined by "~" are collapsed to a single
  #category; the same patterns are applied several times in a row.
  citation = citation.replace("cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other", "cited by other" )
  citation = citation.replace("cited by examiner~cited by other~cited by other", "cited by examiner" )
  citation = citation.replace("cited by other~cited by examiner~cited by examiner", "cited by other" )
  citation = citation.replace("cited by other~cited by other~cited by other~cited by other", "cited by other" )
  citation = citation.replace("cited by examiner~cited by examiner~cited by examiner~cited by examiner", "cited by examiner" )
  citation = citation.replace("cited by other~cited by other", "cited by other" )
  citation = citation.replace("cited by examiner~cited by examiner", "cited by examiner" )
  citation = citation.replace("cited by other~cited by examiner", "cited by other" )
  citation = citation.replace("cited by examiner~cited by other", "cited by examiner" )
  citation = citation.replace("cited by examiner~cited by other~cited by other", "cited by examiner" )
  citation = citation.replace("cited by other~cited by examiner~cited by examiner", "cited by other" )
  citation = citation.replace("cited by other~cited by other~cited by other~cited by other", "cited by other" )
  citation = citation.replace("cited by examiner~cited by examiner~cited by examiner~cited by examiner", "cited by examiner" )
  citation = citation.replace("cited by other~cited by other", "cited by other" )
  citation = citation.replace("cited by examiner~cited by examiner", "cited by examiner" )
  citation = citation.replace("cited by other~cited by examiner", "cited by other" )
  citation = citation.replace("cited by examiner~cited by other", "cited by examiner" )
  citation = citation.replace("cited by examiner~cited by other~cited by other", "cited by examiner" )
  citation = citation.replace("cited by other~cited by examiner~cited by examiner", "cited by other" )
  citation = citation.replace("cited by other~cited by other~cited by other~cited by other", "cited by other" )
  citation = citation.replace("cited by examiner~cited by examiner~cited by examiner~cited by examiner", "cited by examiner" )
  citation = citation.replace("cited by other~cited by other", "cited by other" )
  citation = citation.replace("cited by examiner~cited by examiner", "cited by examiner" )
  citation = citation.replace("cited by other~cited by examiner", "cited by other" )
  citation = citation.replace("cited by examiner~cited by other", "cited by examiner" )
  citation = citation.replace("cited by other~cited by other", "cited by other" )
  citation = citation.replace("cited by examiner~cited by examiner", "cited by examiner" )
  citation = citation.replace("cited by other~cited by examiner", "cited by other" )
  citation = citation.replace("cited by examiner~cited by other", "cited by examiner" )

  #Every remaining "~" becomes a column separator.
  citation = citation.replace("~", "|" )

  #Any "US" still left at this point is replaced by two empty columns.
  citation = citation.replace("US", "||")

  #make unique post-processing citation changes here-If needed for the end of the scripts
  citation = citation.replace("CA|", "Foreign -CA-" )
  citation = citation.replace("EP|", "Foreign -EP-" )
  citation = citation.replace("CN|", "Foreign -CN-" )
  #Restore the temporary stand-ins introduced above ("U$" -> "US", "D!" -> "DE").
  citation = citation.replace("$", "S")
  citation = citation.replace("D!", "DE")

  #citation = citation.replace(" ", " " )

  #END CITATION FILE-------------------------------------------------------------------------------

  #START the inventors file
  #All applicant fields of all inventors are joined with "~" into one string;
  #the replacements below pad missing states and split the string into rows.
  inventor1 = doc.xpath('//applicants/applicant/addressbook/last-name/text()|//applicants/applicant/addressbook/first-name/text()|//applicants/applicant/addressbook/address/city/text()|//applicants/applicant/addressbook/address/state/text()|//applicants/applicant/addressbook/address/country/text()|//applicants/applicant/nationality/*/text()|//applicants/applicant/residence/*/text()|//sequence-cwu/publication-reference/document-id/country/text()|//sequence-cwu/number/text()')
  inventor1 = '~'.join(inventor1).replace('\n-','')

  #For files after 2009 use this to replace State errors in the Excel- If the output is short then use this to add in a None value for State
  inventor1 = inventor1.replace('~KR~omitted','~None~KR~omitted')
  inventor1 = inventor1.replace('~GB~omitted','~None~GB~omitted')
  inventor1 = inventor1.replace('~IT~omitted','~None~IT~omitted')
  inventor1 = inventor1.replace('~JP~omitted','~None~JP~omitted')
  inventor1 = inventor1.replace('~FR~omitted','~None~FR~omitted')
  inventor1 = inventor1.replace('~BR~omitted','~None~BR~omitted')
  inventor1 = inventor1.replace('~NO~omitted','~None~NO~omitted')
  inventor1 = inventor1.replace('~HK~omitted','~None~HK~omitted')
  inventor1 = inventor1.replace('~CA~omitted','~None~CA~omitted')
  inventor1 = inventor1.replace('~TW~omitted','~None~TW~omitted')
  inventor1 = inventor1.replace('~SE~omitted','~None~SE~omitted')
  inventor1 = inventor1.replace('~CH~omitted','~None~CH~omitted')
  inventor1 = inventor1.replace('~DE~omitted','~None~DE~omitted')
  inventor1 = inventor1.replace('~SG~omitted','~None~SG~omitted')
  inventor1 = inventor1.replace('~IN~omitted','~None~IN~omitted')
  inventor1 = inventor1.replace('~IL~omitted','~None~IL~omitted')
  inventor1 = inventor1.replace('~CN~omitted','~None~CN~omitted')
  inventor1 = inventor1.replace('~FI~omitted','~None~FI~omitted')
  inventor1 = inventor1.replace('~ZA~omitted','~None~ZA~omitted')
  inventor1 = inventor1.replace('~NL~omitted','~None~NL~omitted')
  inventor1 = inventor1.replace('~AT~omitted','~None~AT~omitted')
  inventor1 = inventor1.replace('~AU~omitted','~None~AU~omitted')
  inventor1 = inventor1.replace('~BE~omitted','~None~BE~omitted')
  inventor1 = inventor1.replace('~CZ~omitted','~None~CZ~omitted')
  inventor1 = inventor1.replace('~RU~omitted','~None~RU~omitted')
  inventor1 = inventor1.replace('~IE~omitted','~None~IE~omitted')
  inventor1 = inventor1.replace('~AR~omitted','~None~AR~omitted')
  inventor1 = inventor1.replace('~MY~omitted','~None~MY~omitted')
  inventor1 = inventor1.replace('~SK~omitted','~None~SK~omitted')
  inventor1 = inventor1.replace('~ES~omitted','~None~ES~omitted')
  inventor1 = inventor1.replace('~NZ~omitted','~None~NZ~omitted')
  inventor1 = inventor1.replace('~HU~omitted','~None~HU~omitted')
  inventor1 = inventor1.replace('~UA~omitted','~None~UA~omitted')
  inventor1 = inventor1.replace('~DK~omitted','~None~DK~omitted')
  inventor1 = inventor1.replace('~TH~omitted','~None~TH~omitted')
  inventor1 = inventor1.replace('~MX~omitted','~None~MX~omitted')


  #Template for adding another country code:
  #inventor1 = inventor1.replace('~QQ~omitted','~None~QQ~omitted')

  #For the 2005-2008 files use these lines

  inventor1 = inventor1.replace('~NO~NO~NO','~None~NO~NO~NO')
  inventor1 = inventor1.replace('~NZ~NZ~NZ','~None~NZ~NZ~NZ')
  inventor1 = inventor1.replace('~RU~RU~RU','~None~RU~RU~RU')
  inventor1 = inventor1.replace('~RO~RO~RO','~None~RO~RO~RO')
  inventor1 = inventor1.replace('~SE~SE~SE','~None~SE~SE~SE')
  inventor1 = inventor1.replace('~SG~SG~SG','~None~SG~SG~SG')
  inventor1 = inventor1.replace('~SI~SI~SI','~None~SI~SI~SI')
  inventor1 = inventor1.replace('~TH~TH~TH','~None~TH~TH~TH')
  inventor1 = inventor1.replace('~TR~TR~TR','~None~TR~TR~TR')
  inventor1 = inventor1.replace('~TW~TW~TW','~None~TW~TW~TW')
  inventor1 = inventor1.replace('~VE~VE~VE','~None~VE~VE~VE')
  inventor1 = inventor1.replace('~ZA~ZA~ZA','~None~ZA~ZA~ZA')
  inventor1 = inventor1.replace('~AN~AN~AN','~None~AN~AN~AN')
  inventor1 = inventor1.replace('~AR~AR~AR','~None~AR~AR~AR')
  inventor1 = inventor1.replace('~BA~BA~BA','~None~BA~BA~BA')
  inventor1 = inventor1.replace('~PH~PH~PH','~None~PH~PH~PH')
  inventor1 = inventor1.replace('~HR~HR~HR','~None~HR~HR~HR')
  inventor1 = inventor1.replace('~LT~LT~LT','~None~LT~LT~LT')
  inventor1 = inventor1.replace('~EE~EE~EE','~None~EE~EE~EE')
  inventor1 = inventor1.replace('~BJ~BJ~BJ','~None~BJ~BJ~BJ')
  inventor1 = inventor1.replace('~CR~CR~CR','~None~CR~CR~CR')
  inventor1 = inventor1.replace('~PL~PL~PL','~None~PL~PL~PL')
  inventor1 = inventor1.replace('~CO~CO~CO','~None~CO~CO~CO')
  inventor1 = inventor1.replace('~UA~UA~UA','~None~UA~UA~UA')
  inventor1 = inventor1.replace('~KW~KW~KW','~None~KW~KW~KW')
  inventor1 = inventor1.replace('~CL~CL~CL','~None~CL~CL~CL')
  inventor1 = inventor1.replace('~CY~CY~CY','~None~CY~CY~CY')
  inventor1 = inventor1.replace('~LI~LI~LI','~None~LI~LI~LI')
  inventor1 = inventor1.replace('~SA~SA~SA','~None~SA~SA~SA')

  #Template for adding another country code:
  #inventor1 = inventor1.replace('~QQ~QQ~QQ','~None~QQ~QQ~QQ')

  #For lines that don't return use these lines in the code for 2009-
  #The "~" after an inventor's last field is replaced by "|", a newline and
  #docID, so the next inventor starts a new output row.
  inventor1 = inventor1.replace('omitted~US~','omitted~US' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~FR~','omitted~FR' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~DK~','omitted~DK' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~KR~','omitted~KR' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~JP~','omitted~JP' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~GB~','omitted~GB' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~IT~','omitted~IT' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~CH~','omitted~CH' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~SG~','omitted~SG' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~DE~','omitted~DE' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~IN~','omitted~IN' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~TW~','omitted~TW' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('omitted~CN~','omitted~CN' +"|"+ '\n' + str(docID) +"|")


  #Template for adding another country code:
  #inventor1 = inventor1.replace('omitted~QQ~','omitted~QQ' +"|"+ '\n' + str(docID) +"|")

  #for lines 2005-2008 use this line for returning countries
  inventor1 = inventor1.replace('AT~AT~AT~','AT~AT~AT' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('AN~AN~AN~','AN~AN~AN' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('AR~AR~AR~','AR~AR~AR' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('AU~AU~AU~','AU~AU~AU' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('AZ~AZ~AZ~','AZ~AZ~AZ' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('BA~BA~BA~','BA~BA~BA' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('BE~BE~BE~','BE~BE~BE' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('BR~BR~BR~','BR~BR~BR' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('BS~BS~BS~','BS~BS~BS' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('CA~CA~CA~','CA~CA~CA' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('CH~CH~CH~','CH~CH~CH' +"|"+ '\n' + str(docID) +"|")
  inventor1 = inventor1.replace('CN~CN~CN~','CN~CN~CN' +"|"+ '\n' + str(docID) +"|")


  #Template for adding another country code:
  #inventor1 = inventor1.replace('QQ~QQ~QQ~','QQ~QQ~QQ' +"|"+ '\n' + str(docID) +"|")

  #special case fixes- these are for strange names fixes in the code that may not create the correct amount of columns.
  inventor1 = inventor1.replace('~None~None~NO~','~None~NO~')
  inventor1 = inventor1.replace('Ramandeep~Chandigarh','Ramandeep|None~Chandigarh')
  inventor1 = inventor1.replace('Esk~eh~r','Eskehr')
  inventor1 = inventor1.replace('Baychar~Eastport','Baychar~None~Eastport')

  #"US~1" is replaced by six empty columns, then every remaining "~" becomes a column separator.
  inventor1 = inventor1.replace('US~1', '||||||')
  inventor1 = inventor1.replace('~','|') 

  #End the inventor file
  #-------------------------------------------------------------------------------

  #Here are the output print fields- you can change one if you want but remember to comment out all but the one you wish to view.
  #Python 2 print statement: shows the general-information record on the console.
  print "DocID: {0}\nGrantDate: {1}\nApplicationDate: {2}\nNumber of Claims: {3}\nExaminers: {4}\nAssignee: {5}\nInventor: {6}\nUS Cl.: {7}\n".format(docID,grantdate,applicationdate,claimsNum,examiners.encode("UTF-8"),assignees,inventors,uscl1)
  #print "DocID: {0}\nU.S Cl: {1}\nPrimary: {2}\n".format(docID,uscl2,primary1)
  #print "DocID: {0}\nCitation: {1}\n".format(docID,citation.encode("UTF-8"))
  #NOTE(review): the alternative below uses appID, which is not defined anywhere in the visible code.
  #print "DocID:    {0}\nTitle:    {1}\nInventors: {2}\n".format(docID,appID,inventor1.encode("UTF-8"))

  #------------------------------------------------------------------------------- IGNORE Everything else below this.
  #Output first general info bits
  outFile.write(str(docID) +"|"+ str(grantdate) +"|"+ str(applicationdate) + "|"+ str(claimsNum) + "|"+ str(examiners.encode("UTF-8")) + "|"+ str(uscl1) + "|"+ str(assignees) + "|"+ str(inventors)  +"|"+"\n")

  #Output Classifications only
  outFile2.write(str(docID) +"|"+ str(uscl2) +"|"+ "\n")

  #Output Citations only
  #NOTE(review): unlike the examiner and inventor fields, citation is not
  #encoded to UTF-8 before str(); presumably this can fail on non-ASCII
  #citation text under Python 2 - confirm.
  outFile3.write(str(docID) +"|"+ str(citation) +"|"+"\n")

  #Output inventors only
  outFile4.write(str(docID)  + "|"+ str(inventor1.encode("UTF-8")) + "|" +"\n")


outFile.close()
outFile2.close()
outFile3.close()
outFile4.close()
print "output files complete"

score 1 · Accepted Answer

您看到的问题不是 Python 本身的问题。代码会解压一个 zip 文件，并期望在其中找到一个 xml 文件。assert 语句是一条检查（check）语句，用于确保找到了 xml 文件；它的作用是在找不到 xml 文件时让程序停止。如果您下载赋给 datasrc 的那个 zip 文件，会发现它是一个空的 zip 文件。代码试图查找 xml 文件时一个也找不到，因此 xml_file = None；随后执行到 assert 语句时，就会引发断言错误（AssertionError）。

您也许可以去掉 assert，代码照样能够运行，但这样一来程序崩溃时您将不知道原因。把它保留在那里，可以方便地捕捉到失败发生的时间、位置和原因。

python - 使用python解析XML文件

1 回答 1

Related

Reference