Please take a careful look at my parser code. It loops over some web pages, scrapes a few statistics from them and puts the resulting records into an SQLite3 database. Everything works fine until line 87 (the SQL statement), where the process eats up all the CPU and is effectively blocked:

File "./parser.py", line 86, in <module>
    while (j < i):

The database file is created with the correct structure at the beginning of the code, so the problem is somewhere in the loop. The inner block of the main loop, for season in season_list:, works fine. Here is the full code of my script:
#!/usr/bin/env python
from bs4 import BeautifulStoneSoup
from urllib2 import urlopen
import re
import sqlite3
from time import gmtime, strftime
# Print start time
print "We started at ", strftime("%Y-%m-%d %H:%M:%S", gmtime())
# Create DB
print "Trying to create DB"
con = sqlite3.connect('england.db')
cur = con.cursor()
sql = """\
CREATE TABLE english_premier_league (
    id_match INTEGER PRIMARY KEY AUTOINCREMENT,
    season TEXT,
    tour INTEGER,
    date TEXT,
    home TEXT,
    visitor TEXT,
    home_score INTEGER,
    visitor_score INTEGER
);
"""
try:
    cur.executescript(sql)
except sqlite3.DatabaseError as err:
    print "Error creating database: ", err
else:
    print "Successfully created your database..."

con.commit()
cur.close()
con.close()
# list of variables
postfix = 2011
threshold = 1999
season_list = []
while postfix >= threshold:
    end = (postfix + 1) % 2000
    if (end >= 10):
        season = str(postfix) + str(end)
    else:
        season = str(postfix) + str(0) + str(end)
    season_list.append(season)
    postfix -= 1
print season_list
# main loop
for season in season_list:
    href = 'http://www.stat-football.com/en/a/eng.php?b=10&d='+season+'&c=51'
    print href
    xml = urlopen(href).read()
    xmlSoup = BeautifulStoneSoup(xml)
    tablet = xmlSoup.find(attrs={"class" : "bd5"})
    #Access DB
    con = sqlite3.connect('england.db')
    cur = con.cursor()
    #Parse site
    tour = tablet.findAll(attrs = { "class" : re.compile(r"^(s3|cc s3)$") })
    date = tablet.findAll(text = re.compile(r"(0[1-9]|[12][0-9]|3[01])\.(0[1-9]|1[012])\.(19|20)\d\d"))
    home = tablet.findAll(attrs = {"class" : "nw"})
    guest = tablet.findAll(attrs = {"class" : "s1"})
    score = tablet.findAll(attrs = {"class" : "nw pr15"})
    #
    def parse_string(sequence):
        result = []
        for unit in sequence:
            text = ''.join(unit.findAll(text=True))
            result.append(text.strip())
        return result
    tour_list = parse_string(tour)
    home_list = parse_string(home)
    guest_list = parse_string(guest)
    score_list = parse_string(score)
    #Loop over found records to put them into sqlite3 DB
    i = len(tour_list)
    j = 0
    while (j < i):
        sql_add = 'INSERT INTO english_premier_league (season, tour, date, home, visitor, home_score, visitor_score) VALUES (?, ?, ?, ?, ?, ?, ?)'
        match = (season, int(tour_list[j]), date[j], home_list[j], guest_list[j], int(score_list[j][0:1]), int(score_list[j][2:3]))
        try:
            cur.executemany(sql_add, match)
        except sqlite3.DatabaseError as err:
            print "Error matching the record: ", err
        else:
            con.commit()
        part = float(j)/float(i)*100
        if (part%10 == 0):
            print (int(part)), "%"
        j += 1
    cur.close()
    con.close()
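Since I am not sure I am passing the parameters to the INSERT in the right shape, here is a minimal standalone sketch, separate from the parser, of the two sqlite3 insert styles as I understand them (throwaway in-memory database, made-up table and values, just for illustration):

# Minimal sketch, NOT the parser itself: an in-memory DB and a made-up table,
# only to illustrate the parameter shapes the sqlite3 module expects.
import sqlite3

con = sqlite3.connect(':memory:')
cur = con.cursor()
cur.execute('CREATE TABLE t (a INTEGER, b TEXT)')

# execute() takes one SQL statement and ONE parameter tuple
cur.execute('INSERT INTO t (a, b) VALUES (?, ?)', (1, 'one'))

# executemany() takes one SQL statement and a SEQUENCE of parameter tuples,
# one tuple per row to insert
rows = [(2, 'two'), (3, 'three')]
cur.executemany('INSERT INTO t (a, b) VALUES (?, ?)', rows)

con.commit()
print cur.execute('SELECT * FROM t').fetchall()
con.close()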
It may also be useful to look at the end of the strace output:
getcwd("/home/vitaly/football_forecast/epl", 512) = 35
stat("/home/vitaly/football_forecast/epl/england.db", {st_mode=S_IFREG|0644, st_size=24576, ...}) = 0
open("/home/vitaly/football_forecast/epl/england.db", O_RDWR|O_CREAT, 0644) = 3
fcntl(3, F_GETFD) = 0
fcntl(3, F_SETFD, FD_CLOEXEC) = 0
fstat(3, {st_mode=S_IFREG|0644, st_size=24576, ...}) = 0
lseek(3, 0, SEEK_SET) = 0
read(3, "SQLite format 3\0\4\0\1\1\0@ \0 \0\1~\0\0\0\30"..., 100) = 100
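And just to back up the claim above that the table itself is created correctly, this is the kind of standalone check that can be run against england.db from a separate Python session (a sketch, assuming the parser has already created the file):

# Separate sanity check, not part of the parser: confirm the table exists
# and count how many rows have made it in so far.
import sqlite3

con = sqlite3.connect('england.db')
cur = con.cursor()
cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='english_premier_league'")
print cur.fetchone()
cur.execute("SELECT COUNT(*) FROM english_premier_league")
print cur.fetchone()[0]
con.close()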
I am running Python 2.7 on Ubuntu 12.04. Thanks a lot.