python - 读取 URL 后捕获数据块

Question

预期输出附在下面。我正在尝试读取一个 URL，并且能够成功读取它，但是当我尝试捕获“Combo”块下面的数据时遇到了一个错误。关于如何解决这个问题，有什么建议吗？

# Script version string, formatted as YYYYMMDD
version = "20121112"

# File type to be output to logs
# Should be changed to exe before building the exe.
# NOTE(review): not referenced anywhere in the visible script - confirm it is still needed
fileType = "py"

# Import sys to read command line arguments
import sys, getopt
#import pdb
#pdb.set_trace()

import argparse
import urllib
import urllib2
import getpass
import re

def update (url):
    """Download ``url`` with basic-auth credentials and locate the "Combo" tag.

    Prints the URL, prompts for a password, installs a process-wide urllib2
    opener with basic-auth and cookie handling, then downloads the page and
    prints its text.

    Returns an HTML error snippet (a string) when the request raises
    ``urllib2.HTTPError``. On the success path nothing is returned
    explicitly: the positions computed at the end are only stored in locals
    that are never used.

    Raises AttributeError when the ``<Combo...>`` pattern is not found in
    the page, and ValueError when the page contains no ``</textarea>``.
    """
    print url

    authhost = 'https://login.company.com'
    # Siteminder test server
    # Credentials: login name of the current user plus a password prompted
    # for on the terminal.
    user = getpass.getuser()
    password = getpass.getpass()
    # None is the catch-all realm of HTTPPasswordMgrWithDefaultRealm
    realm = None

    # handle the authentication and cookies
    cookiehand = urllib2.HTTPCookieProcessor()
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(user=user,
                              passwd=password,
                              uri=authhost,
                              realm=realm)
    auth_handler = urllib2.HTTPBasicAuthHandler(password_mgr)
    opener = urllib2.build_opener(auth_handler, cookiehand)
    # Make this opener the default used by every later urllib2.urlopen() call
    urllib2.install_opener(opener)
    #make the request
    req = urllib2.Request(url=url)
    try:
        f = urllib2.urlopen(req)
        txt = f.read()
        f.close()
    except urllib2.HTTPError, e:
        # The exception object ``e`` is not inspected, and ``txt`` is not
        # used again because the function returns right away.
        txt = ''
        print 'An error occured connecting to the wiki. No wiki page will be generated.'
        return '<font color=\"red\">QWiki</font>'
    # Look for a literal "<Combo ...>" tag with a regular expression
    # (the page text is printed first)
    print txt
    p = re.compile('<Combo[^>]*>')
    # search() returns None when the pattern does not occur in txt
    m = p.search(txt)
    # Raises AttributeError if m is None (no match)
    (tagStart, tagEnd) = m.span()
    # Find the end of the textarea; str.index raises ValueError when the
    # closing tag is absent. The result is not used afterwards.
    endTag = txt.index("</textarea>")

def main ():
    #For logging
    print "test"
    parser = argparse.ArgumentParser(description='This is the update.py script created by test')
    parser.add_argument('-u','--url',action='store',dest='url',default=None,help='<Required> url link',required=True)
    results = parser.parse_args()# collect cmd line args
    url = results.url
    #print url
    update(url)
if __name__ == '__main__':
    main()

当前输出：-

C:\Dropbox\scripts>python announce_update.py --u "http://qwiki.company.com/component/w/index.php?title=Test1&action=raw"
test
http://qwiki.company.com/component/w/index.php?title=Test1&action=raw
Password:
==== <font color="#008000">Combo</font> ====

{| border="1" cellspacing="1" cellpadding="1"
|-
! bgcolor="#67B0F9" scope="col" | test1
! bgcolor="#67B0F9" scope="col" | test2
! bgcolor="#67B0F9" scope="col" | test3
! bgcolor="#67B0F9" scope="col" | test4
|-
| [http:link.com]
|}

==== <font color="#008000">COde:</font> ====
Traceback (most recent call last):
  File "announce_update.py", line 66, in <module>
    main()
  File "announce_update.py", line 64, in main
    update(url)
  File "announce_update.py", line 52, in update
    (tagStart, tagEnd) = m.span()
AttributeError: 'NoneType' object has no attribute 'span'

预期输出：-

{| border="1" cellspacing="1" cellpadding="1"
|-
! bgcolor="#67B0F9" scope="col" | test1
! bgcolor="#67B0F9" scope="col" | test2
! bgcolor="#67B0F9" scope="col" | test3
! bgcolor="#67B0F9" scope="col" | test4
|-
| [http:link.com]
|}

score 1 · Accepted Answer

如果在文本 txt 中找不到模式 p，p.search(txt) 会返回 None；随后对 None 调用 .span 就导致了这个错误。

要从 html 中的第一个 <textarea> 元素中提取文本，您可以使用 BeautifulSoup（一个 HTML 解析器）而不是正则表达式：

from bs4 import BeautifulSoup # pip install beautifulsoup4

# Name the parser explicitly: otherwise bs4 picks whichever parser happens
# to be installed (results can differ between machines) and emits a warning.
soup = BeautifulSoup(txt, "html.parser")
# Text of the first <textarea> element in the document
print(soup.textarea.string)

您也可以只使用标准库中的 HTMLParser 来完成同样的操作：

#!/usr/bin/env python
import cgi

try:
    from html.parser import HTMLParser
except ImportError: # Python 2
    from HTMLParser import HTMLParser

try:
    from urllib.request import urlopen
except ImportError: # Python 2
    from urllib2 import urlopen

# Page to download
url = 'http://qwiki.company.com/component/w/index.php?title=Test1&action=raw'
# Name of the HTML element whose text content should be extracted
tag = 'textarea'

class Parser(HTMLParser):
    """Extract tag's text content from html.

    The document is parsed on construction. Afterwards ``contents`` holds,
    in document order, every text chunk found inside ``<tag>`` elements,
    including text that sits inside nested child elements.

    Parameters:
        html: the markup to parse (text, not bytes).
        tag: name of the element to collect text from, as reported by
            HTMLParser to ``handle_starttag``.
    """
    def __init__(self, html, tag):
        # Kept as an explicit base-class call so the same code runs on
        # both Python 2 and Python 3.
        HTMLParser.__init__(self)
        self.contents = []
        self.intag = False
        self.tag = tag
        # Number of currently open ``<tag>`` elements. Tracking depth
        # (rather than a single flag toggled by every tag) keeps capturing
        # across nested children and void elements such as <br>.
        self._depth = 0
        self.feed(html)

    def handle_starttag(self, tag, attrs):
        # Only the target tag affects capturing; other tags are ignored.
        if tag == self.tag:
            self._depth += 1
            self.intag = True

    def handle_endtag(self, tag):
        # Ignore stray closing tags that have no matching opening tag.
        if tag == self.tag and self._depth:
            self._depth -= 1
            self.intag = self._depth > 0

    def handle_data(self, data):
        if self.intag:
            self.contents.append(data)

# download and convert to Unicode
response = urlopen(url)
try:
    _, params = cgi.parse_header(response.headers.get('Content-Type', ''))
    raw = response.read()
finally:
    # release the connection even if reading the body fails
    response.close()
# The Content-Type header may carry no charset parameter; fall back to
# UTF-8 instead of failing with KeyError.
html = raw.decode(params.get('charset', 'utf-8'))

# parse html (extract text from the first `<tag>` element)
# NOTE: raises IndexError when the page contains no such element
content = Parser(html, tag).contents[0]
print(content)

score 0 · Accepted Answer

该错误表明变量 m 为 None：正则表达式没有匹配到任何内容，所以 p.search(txt) 返回的不是匹配对象。

此外，您的正则表达式似乎无论如何都找不到正确的文本，因为它会停在</font>.

我在 http://docs.python.org/2/howto/regex.html 找到了一份很好的 re 模块使用参考。

读完之后，我认为您需要这样的表达

# re.DOTALL lets '.' cross line breaks (the wiki table spans several lines);
# the lazy '.*?' stops at the first '{' after ">Combo<" and at the first '}'
# after that; the braces are escaped so they are always taken literally.
p = re.compile(r'>Combo<.*?(\{.*?\})', re.DOTALL)

注意 r 表示原始（raw）字符串，它告诉 Python 不要解释反斜杠等；我用圆括号创建了一个“组”，这样您就可以只提取匹配结果中的这一部分。现在当您进行搜索时：

# search() scans the whole text for the pattern; match() would only try
# at the very start of the string, and it needs the text as an argument.
m = p.search(txt)

您应该能够只提取 >Combo< 之后第一对大括号中的那部分内容：

# group(1) is the text captured by the parenthesised group of the pattern
myText = m.group(1)

这可能并不完美，但应该非常接近——我想表达的是：您需要找到“>Combo< 之后的第一个左大括号，直到下一个右大括号”。圆括号表示“这是我想要的部分”，而 group 的索引则用来从匹配对象中把它提取出来。

python - 读取 URL 后捕获数据块

2 回答 2

Related

Reference