0

我需要从这个网站解析数据:http://www.sarkari-naukri.in/jobs-by-qualification/b-tech/sub-centre-manager.html

BeautifulSoup 的大部分教程都是用于解析链接,而不是从链接中深入解析所需的数据。

现在我浏览了python的BeautifulSoup模块的一些教程并编写了这个脚本来下载所需的数据字符串

 <div id="content_box">
        <div id="content" class="hfeed">...

我正在使用的脚本:

from BeautifulSoup import BeautifulSoup
import urllib2

def main():
    """Fetch the job-posting page and write each 'scdetail' field to postdata.txt.

    Bug fix: the detail values on this page are <span class="scdetail">
    elements, not <div>s, so the original findAll('div', ...) always
    returned an empty list and nothing was written to the file.
    """
    url = "http://www.sarkari-naukri.in/jobs-by-qualification/b-tech/sub-centre-manager.html"
    data = urllib2.urlopen(url).read()
    bs = BeautifulSoup(data)

    # The whole posting lives inside <div id="content_box">.
    postdata = bs.find('div', {'id': 'content_box'})
    # Each field value is a <span class="scdetail">, e.g.
    #   <span class="scdetail" itemprop="postalCode">160017</span>
    postdata = [s.getText().strip()
                for s in postdata.findAll('span', {'class': 'scdetail'})]

    fname = 'postdata.txt'
    with open(fname, 'w') as outf:
        outf.write('\n'.join(postdata))

if __name__=="__main__":
    main()

但是这个脚本没有按我的预期工作。我希望把职位信息按如下格式逐条写入文件:

职位:国家电子和信息技术学院分中心经理职位空缺 - 昌迪加尔

分中心经理

国立电子信息技术研究所

地址:NIELIT, Chandigarh SCO: 114-116 Sector 17B

邮政编码:160017

昌迪加尔市等等......

请帮助或建议。

谢谢

4

3 回答 3

0

您的问题出在此处:postdata.findAll('div', {'class': 'scdetail'})。你在查找 div,而页面上用的是 span。将其更改为 postdata.findAll('span', {'class': 'scdetail'}) 就会得到非空结果。

您要读取的值之一的示例:

<div class="scheading">
    "Pay Scale: " <span class="scdetail" itemProp="baseSalary">Rs. 15,000/-</span>
</div>
于 2013-06-29T03:06:53.507 回答
0

这个 pyparsing 提取器将挑选出匹配的 div/span 标签:

from pyparsing import makeHTMLTags, withAttribute, SkipTo

"""
sample:
<div class="scheading">Postal Code: <span class="scdetail" 
    itemprop="postalCode">160017</span></div>
"""
# Build tolerant matchers for <div>/</div> and <span>/</span> tags
# (makeHTMLTags handles attributes, whitespace and case variations).
div,divEnd = makeHTMLTags("div")
span,spanEnd = makeHTMLTags("span")
# Restrict the matchers to the class attributes used by the job listing.
div.setParseAction(withAttribute(("class","scheading")))
span.setParseAction(withAttribute(("class","scdetail")))

# Pattern: <div class="scheading">LABEL<span class="scdetail">VALUE</span></div>
# "label" and "value" are result names for the text captured by SkipTo.
patt = (div + SkipTo(span)("label") + span + SkipTo(spanEnd)("value") + 
            spanEnd + divEnd)

attrs = {}
# NOTE(review): `html` is not defined in this snippet — it must hold the raw
# page source (e.g. urllib2.urlopen(url).read()) before this loop runs.
for match in patt.searchString(html):
    # match.itemprop is the span's itemprop attribute, exposed by makeHTMLTags;
    # key each entry by it and store (stripped label text, value text).
    attrs[match.itemprop] = (match.label[0].strip(), match.value)

from pprint import pprint
pprint(attrs.items())

印刷:

[('skills',
  ('Desired Skills:',
   'Preference will be given to candidates having good knowledge of UNIX &amp; Visual FoxPro.')),
 ('qualifications',
  ('Qualifications:',
   '\x91A\x92 level of DOEACC / PGDCA with 2 years experience. ')),
 ('educationRequirements',
  ('Educational Requirements:',
   'B. E. / B. Tech. (CS / IT / Electronics) / MCA / M. Sc. (CS / IT / Electronics) / \x91B\x92 level of DOEACC ')),
 ('addressLocality', ('City', 'Chandigarh')),
 ('addressRegion', ('State', 'Haryana and Punjab')),
 ('streetAddress', ('Address:', 'NIELIT, Chandigarh SCO: 114-116 Sector 17B')),
 ('postalCode', ('Postal Code:', '160017')),
 ('baseSalary', ('Pay Scale:', 'Rs. 15,000/-'))]
于 2013-06-29T11:02:37.973 回答
0

此解决方案使用 BeautifulSoup

import os
import sys

# Import System libraries
import re
import urllib2

# Import Custom libraries
from BeautifulSoup import BeautifulSoup, Tag

# Predicate helpers passed to BeautifulSoup's findAll(callable); each returns
# True when a tag carries the attribute signature of interest.
# Outer <div id="content"> wrapper that holds the whole posting.
job_location = lambda x: x.name == "div" and set([(u"id", u"content")]) <= set(x.attrs)
# Element carrying the job title (class="schema_title", itemprop="title").
job_title_location = lambda x: set([(u"class", u"schema_title"), (u"itemprop", u"title")]) <= set(x.attrs)
# Element carrying the hiring organisation's name.
organ_location = lambda x: set([(u"class", u"schema_hiringorganization"), (u"itemprop", u"name")]) <= set(x.attrs)
# Heading <div>s whose class matches "s.*heading" (e.g. "scheading").
details_key_location = lambda x: x.name == "div" and bool(re.search("s.*heading", dict(x.attrs).get(u"class", "")))

def coll_up(ilist, base=0, count=0):
    '''
    Recursively collapse nested lists at depth base and above
    '''
    # Iterables recurse one level deeper; leaves are wrapped in a list
    # (or kept bare when they still sit above the collapse depth).
    if isinstance(ilist, (list, tuple)):
        flat = []
        for element in ilist:
            flat += coll_up(element, base, count + 1)
    elif base > count:
        flat = ilist
    else:
        flat = [ilist]
    # Intermediate levels above the collapse depth keep one layer of nesting.
    if count != 0 and base > count:
        return [flat]
    return flat

def info_extract(ilist, count=0):
    '''
    Recursively walk a nested list and upon finding a non iterable, return its string
    '''
    found = []
    if isinstance(ilist, list):
        for node in ilist:
            if isinstance(node, Tag):
                # A Tag: descend into its children.
                found += info_extract(node.contents, count + 1)
            else:
                # A plain navigable string: keep it if non-blank once trimmed.
                text = node.strip()
                if text:
                    found.append(text)
    # Nested calls wrap their results one level so callers keep the structure.
    return [found] if count != 0 else found

def main():
    # Scrape the job posting and write "label value" lines to postdata.txt.
    # Relies on the module-level locator lambdas plus coll_up/info_extract.
    url = "http://www.sarkari-naukri.in/jobs-by-qualification/b-tech/sub-centre-manager.html"
    data = urllib2.urlopen(url).read()
    soup = BeautifulSoup(data)

    # Find the <div id="content"> wrapper; bail out silently if absent.
    job_tags = soup.findAll(job_location)
    if(job_tags):
        job_tag = job_tags[0]
        # [0] unwraps the extra nesting info_extract adds on recursive calls.
        job_title = info_extract(job_tag.findAll(job_title_location))[0]
        organ = info_extract(job_tag.findAll(organ_location))[0]
        # Collapse each heading div's text pieces into [label, value, ...] lists.
        details = coll_up(info_extract(job_tag.findAll(details_key_location)), 2)

        # NOTE(review): combined_dict is built but never used afterwards.
        combined_dict = dict([tuple(["Job Title:"] + job_title)] + [tuple(["Organisation:"] + organ)] + [tuple(detail) for detail in details])
        combined_list = [["Job Title:"] + job_title, ["Organisation:"] + organ] + details
        # Join each record's parts with spaces to form one output line apiece.
        postdata = [" ".join(x) for x in combined_list]
        print postdata

        fname = "postdata.txt"
        with open(fname, "w") as outf:
            # Encode explicitly: the page text may contain non-ASCII characters.
            outf.write("\n".join(postdata).encode("utf8"))

if __name__=="__main__":
    main()
于 2013-06-30T14:13:34.877 回答