当我下载英文圣经时,我的脚本有效。但是当我下载外国圣经时给我一个 ascii 错误。
Python
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
import lxml.html as html
import urlparse
import os, sys
import urllib2
import re
print ("downloading and converting Bibles to Aurora...")
root = html.parse(open('links.html'))
for link in root.findall('//a'):
url = link.get('href')
name = urlparse.urlparse(url).path.split('/')[-1]
namesave = '%s.html' % '.'.join(name.split('.')[:-1])
chnum = name.split('.')[-2]
dirname = urlparse.urlparse(url).path.split('.')[-1]
try:
f = urllib2.urlopen(url)
except urllib2.URLError:
print "Bad URL or timeout"
continue
s = f.read()
if (os.path.isdir(dirname) == 0):
os.mkdir(dirname)
soup = BeautifulSoup(s)
thearticle = soup.html.body.article
bookname = thearticle['data-book-human']
soup.html.replaceWith('<html>'+str(bookname)+'</html>')
converted = str(soup)
full_path = os.path.join(dirname, namesave)
open(full_path, 'wb').write(converted)
print(name)
print("DOWNLOADS AND CONVERSIONS COMPLETE!")
有效的links.html
<a href="http://www.youversion.com/bible/john.6.ceb">http://www.youversion.com/bible/john.6.ceb</a>
给出错误的links.html
<a href="http://www.youversion.com/bible/john.6.nav">http://www.youversion.com/bible/john.6.nav</a>
错误
File "test.py", line 32, in <module>
soup.html.replaceWith('<html>'+str(bookname)+'</html>')
UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-4: ordinal not in range(128)