python - 网页抓取市盈率

Question

我正在尝试收集四只股票的市盈率。我不确定我哪里出错了，感谢您的帮助。

我认为问题出在贪婪与非贪婪限定符的使用上，以及我从该 URL 的页面源码中复制内容来构造正则表达式的方式上。

import urllib
import re

# Question's original attempt (Python 2: uses `urllib.urlopen` and the print
# statement). For each ticker it downloads a Yahoo Finance quote page and
# tries to pull the P/E value out of the raw HTML with a regular expression.
symbolslist = ["aapl","spy","goog","nflx"]

# Manual index loop over the ticker list.
i=0
while i<len(symbolslist):
    url = "http://finance.yahoo.com/q?s=" +symbolslist[i] +"&q1=1"
    htmlfile = urllib.urlopen(url)
    htmltext = htmlfile.read()
    # NOTE(review): as written, this pattern requires literal double-quote
    # characters around `P/E `, and the unescaped `(ttm)` is a capturing group
    # that matches the text `ttm` rather than the literal text "(ttm)".
    # The exact run of spaces before `</th>` must also match.
    # Presumably the page markup has neither the quotes nor bare `ttm` --
    # verify against the actual page source.
    regex = '<th scope="row" width="48%">"P/E "<span class="small">(ttm)</span>:    </th><td class="yfnc_tabledata1">(.+?)</td>'
    pattern = re.compile(regex)
    # With two capturing groups in the pattern, findall returns a list of
    # 2-tuples (one per match), or an empty list when nothing matches.
    price_to_earnings = re.findall(pattern,htmltext)
    print "The price to earnings of", symbolslist[i]," is", price_to_earnings
    i+=1

score 0 · Accepted Answer

此解决方案使用 BeautifulSoup 来解析页面：

import re
import urllib

try:
    #Using bs4
    from bs4 import BeautifulSoup
    from bs4 import Tag
except ImportError:
    #Using bs3
    from BeautifulSoup import BeautifulSoup
    from BeautifulSoup import Tag

def check_th(iarg):
    """Return True if *iarg* is a ``<th scope="row">`` tag mentioning "P/E".

    The ``scope`` attribute is read through ``Tag.get`` so the check does not
    depend on how the parser stores attributes: BeautifulSoup 3 keeps
    ``attrs`` as a list of ``(name, value)`` pairs, while bs4 keeps a dict.
    A set comparison against ``attrs`` only works for the list-of-pairs form
    and is always False for a dict.

    Args:
        iarg: A tag-like object exposing ``name``, ``get(key)`` and
            ``contents``.

    Returns:
        bool: True when the tag is a row header and any of its direct
        children contains the text ``P/E``; False otherwise.
    """
    if iarg.name != u"th" or iarg.get(u"scope") != u"row":
        return False
    # Searching for the bare token is equivalent to the former
    # "\s*P/E\s*" pattern: with re.search, optional surrounding whitespace
    # never changes whether a match exists.
    return any(re.search(r"P/E", str(x)) for x in iarg.contents)

# Ordered chain of tag predicates used to drill down into the page: each
# predicate is passed to findAll() on the tags matched by the previous one
# (table with id "table2" -> its rows -> the row header that mentions P/E).
# The id is read through Tag.get() so the check works whether the parser
# stores attrs as a list of pairs (BeautifulSoup 3) or as a dict (bs4).
tag_locations = [
    lambda x: x.name == u"table" and x.get(u"id") == u"table2",
    lambda x: x.name == u"tr",
    check_th,
]

# Resolve urlopen for either interpreter line, mirroring the bs4/bs3 import
# fallback used at the top of this snippet.
try:
    # Python 3
    from urllib.request import urlopen
except ImportError:
    # Python 2
    from urllib import urlopen

symbolslist = ["aapl", "spy", "goog", "nflx"]

# For each ticker: download the quote page, walk the tag_locations predicate
# chain down to the "P/E" row header, then print the value held in the
# neighbouring data cell.
for symbol in symbolslist:
    url = "http://finance.yahoo.com/q?s=" + symbol + "&q1=1"
    htmlfile = urlopen(url)
    try:
        htmltext = htmlfile.read()
    finally:
        # Always release the connection, even if the read fails.
        htmlfile.close()
    soup = BeautifulSoup(htmltext)

    # Narrow the candidate set one predicate at a time; stop as soon as a
    # level matches nothing.
    found_tags = [soup]
    for tag_location in tag_locations:
        found_tags = [
            match
            for found_tag in found_tags
            for match in found_tag.findAll(tag_location)
        ]
        if not found_tags:
            break

    if not found_tags:
        # No P/E header on this page: nothing to report for this symbol.
        continue

    # Match by tag name and class via the library's own attribute filter
    # rather than inspecting `attrs` directly: attrs is a list of pairs in
    # BeautifulSoup 3 but a dict (with `class` stored as a list) in bs4.
    data_tag = found_tags[0].findNextSibling(u"td", {u"class": u"yfnc_tabledata1"})
    if data_tag is None:
        # Header found but no value cell follows it; skip instead of
        # failing with AttributeError on `.text`.
        continue

    # Single pre-formatted argument keeps this valid on Python 2 and 3.
    print("The price to earnings of %(company)s is %(ratio)s" % {"company": symbol, "ratio": data_tag.text})

python - 网页抓取市盈率

1 回答 1

Related

Reference