html - 如何增加浏览器响应

Question

我有一个程序，它使用 RoboBrowser 向网站发送大量请求并获得答案，但现在我需要将这些答案过滤到只有那些没有此字符串“Case Status Not Available”的答案我试图使用 beautifulsoup，但它返回错误。

这是到目前为止的代码：

import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import csv
import pickle
import requests
from robobrowser import RoboBrowser

def rename_files():
    file_list = os.listdir(r"C:\\PROJECT\\pdfs")
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is '+saved_path)
    os.chdir(r'C:\\PROJECT\\pdfs')
    for file_name in file_list:
        os.rename(file_name, file_name.translate(None, " "))
    os.chdir(saved_path)
rename_files()

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)
# Change this to your pdf2htmlEX executable location
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print "Could not find %s" % bin_path
    exit(1)
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # If this is not a PDF file
        if not file_name.endswith('.pdf'):
            # Skip it
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert your PDF to HTML here
        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " %args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors
htmls_path = 'C:\\PROJECT'
with open ('score.csv', 'w') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            with open(file_name) as markup:
                soup = BeautifulSoup(markup.read())
                text = soup.get_text()
                match = re.findall("PA/(\S*)", text)#To remove the names that appear, just remove the last (\S*), to add them is just add the (\S*), before it there was a \s*
                print(match)
                writer.writerow(match)
                for item in match:
                    data = item.split('/')
                    case_number = data[0]
                    case_year = data[1]

                    browser = RoboBrowser()
                    browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
                    form = browser.get_forms()[0]  # Get the first form on the page
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year

                    browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])

                    # Use BeautifulSoup to parse this data
                    print(browser.response.text)
                    souptwo = BeautifulSoup(browser.response.text)
                    texttwo = soup.get_text()
                    matchtwo = soup.findall('<td class="fieldData">Case Status Not Available</TD>')
                    if not matchtwo:
                        soupthree = BeautifulSoup(browser.response.text)
                        print soupthree

返回的错误是：

Traceback (most recent call last):
  File "C:\PROJECT\pdfs\converterpluspa.py", line 87, in <module>
    matchtwo = soup.findall('<td class="fieldData">Case Status Not Available</TD>')
TypeError: 'NoneType' object is not callable

score 0 · Accepted Answer

第 87 行包括尝试调用的方法findall。在第 65 行定义，调用它来解析文件的内容。由于错误诊断显示为 None 这意味着无法解析该文件。soupsoupBeautifulSoupsoupBeautifulSoup

html - 如何增加浏览器响应

1 回答 1

Related

Reference