def dcrawl(link):
    """Fetch a BusinessWire story page and pull out its parts.

    Parameters:
        link -- URL of the story page to scrape.

    NOTE(review): the extracted values (date, title, freport, place) are
    only bound to locals and nothing is returned -- preserved as-is so
    callers relying on the current (None) return are unaffected.
    """
    # Function-scope imports kept from the original (script style).
    from bs4 import BeautifulSoup
    import urllib

    # Fetch the document.  FancyURLopener has no context-manager support,
    # so close the handle explicitly -- the original leaked it.
    op = urllib.FancyURLopener({})
    f = op.open(link)
    try:
        h_doc = f.read()
    finally:
        f.close()

    # Trim down to the base story container.  Naming the parser explicitly
    # ("html.parser") makes results reproducible across machines instead of
    # depending on whichever parser bs4 happens to auto-detect.
    idoc1 = BeautifulSoup(h_doc, "html.parser")
    idoc2 = str(idoc1.find(id="bwStory"))
    bdoc = BeautifulSoup(idoc2, "html.parser")

    # Extract the date: first 13 characters of the first nested div's text.
    dat = str(bdoc.div.div.string)[0:13]
    date = dst(dat)  # dst() is defined elsewhere in this module

    # Extract the title as a string (text of the first <b> tag).
    title = str(bdoc.b.string)

    # Extract the full report: stringified list of every <p> tag.
    freport = str(bdoc.find_all("p"))

    # Extract the place: text of the first <p> in the story body, up to the
    # first "-" separator (presumably "CITY--(BUSINESS WIRE)--" -- confirm).
    plc = bdoc.find(id="bwStoryBody")
    puni = plc.p.string
    # Encode to ASCII to eliminate unicode discrepancies.
    pasi = puni.encode('ascii', 'ignore')
    com = pasi.find("-")
    # Bug fix: find() returns -1 when no dash exists, and pasi[:-1] would
    # silently drop the last character.  Keep the whole string in that case.
    place = pasi[:com] if com != -1 else pasi
相同的转换“bdoc.b.string”在这里有效:
#extract the full report as a string
freport = str(bdoc.find_all("p"))
在行中:
plc = bdoc.find(id = "bwStoryBody")
plc
返回一些数据,而 plc.p
返回第一个 <p>...</p> 标签,
,但将其转换为字符串不起作用。
因为 puni
之前返回的是一个字符串对象,我在这里遇到了 Unicode 错误,因此不得不对其进行编码,得到 pasi
这个结果。