我编写了一个快速的小程序来从联合国教科文组织网站上刮取图书数据,其中包含有关图书翻译的信息。代码正在做我想做的事,但是当它处理大约 20 个国家/地区时,它使用了大约 6GB 的 RAM。由于我需要处理大约 200 个,因此这对我不起作用。
我不确定所有 RAM 使用量来自哪里,所以我不确定如何减少它。我假设它是保存所有书籍信息的字典,但我并不肯定。我不确定我是否应该简单地让程序为每个国家/地区运行一次,而不是处理其中的很多?或者如果有更好的方法来做到这一点?
这是我第一次写这样的东西,我是一个非常新手,自学成才的程序员,所以请指出代码中的任何重大缺陷,或者你有可能与问题没有直接关系的改进技巧在眼前。
这是我的代码,在此先感谢您的帮助。
from __future__ import print_function
import urllib2, os
from bs4 import BeautifulSoup, SoupStrainer
''' Set list of countries and their code for niceness in explaining what
is actually going on as the program runs. '''
countries = {"AFG":"Afghanistan","ALA":"Aland Islands","DZA":"Algeria"}
'''List of country codes since dictionaries aren't sorted in any
way, this makes processing easier to deal with if it fails at
some point, mid run.'''
country_code_list = ["AFG","ALA","DZA"]
base_url = "http://www.unesco.org/xtrans/bsresult.aspx?lg=0&c="
destination_directory = "/Users/robbie/Test/"
only_restable = SoupStrainer(class_="restable")
class Book(object):
def set_author(self,book):
'''Parse the webpage to find author names. Finds last name, then
first name of original author(s) and sets the Book object's
Author attribute to the resulting string.'''
authors = ""
author_last_names = book.find_all('span',class_="sn_auth_name")
author_first_names = book.find_all('span', attrs={\
'class':"sn_auth_first_name"})
if author_last_names == []: self.Author = [" "]
for author in author_last_names:
try:
first_name = author_first_names.pop()
authors = authors + author.getText() + ', ' + \
first_name.getText()
except IndexError:
authors = authors + (author.getText())
self.author = authors
def set_quality(self,book):
''' Check to see if book page is using Quality, then set it if
so.'''
quality = book.find_all('span', class_="sn_auth_quality")
if len(quality) == 0: self.quality = " "
else: self.quality = quality[0].contents[0]
def set_target_title(self,book):
target_title = book.find_all('span', class_="sn_target_title")
if len(target_title) == 0: self.target_title = " "
else: self.target_title = target_title[0].contents[0]
def set_target_language(self,book):
target_language = book.find_all('span', class_="sn_target_lang")
if len(target_language) == 0: self.target_language = " "
else: self.target_language = target_language[0].contents[0]
def set_translator_name(self,book) :
translators = ""
translator_last_names = book.find_all('span', class_="sn_transl_name")
translator_first_names = book.find_all('span', \
class_="sn_transl_first_name")
if translator_first_names == [] and translator_last_names == [] :
self.translators = " "
return None
for translator in translator_last_names:
try:
first_name = translator_first_names.pop()
translators = translators + \
(translator.getText() + ',' \
+ first_name.getText())
except IndexError:
translators = translators + \
(translator.getText())
self.translators = translators
def set_published_city(self,book) :
published_city = book.find_all('span', class_="place")
if len(published_city) == 0:
self.published_city = " "
else: self.published_city = published_city[0].contents[0]
def set_publisher(self,book) :
publisher = book.find_all('span', class_="place")
if len(publisher) == 0:
self.publisher = " "
else: self.publisher = publisher[0].contents[0]
def set_published_country(self,book) :
published_country = book.find_all('span', \
class_="sn_country")
if len(published_country) == 0:
self.published_country = " "
else: self.published_country = published_country[0].contents[0]
def set_year(self,book) :
year = book.find_all('span', class_="sn_year")
if len(year) == 0:
self.year = " "
else: self.year = year[0].contents[0]
def set_pages(self,book) :
pages = book.find_all('span', class_="sn_pagination")
if len(pages) == 0:
self.pages = " "
else: self.pages = pages[0].contents[0]
def set_edition(self, book) :
edition = book.find_all('span', class_="sn_editionstat")
if len(edition) == 0:
self.edition = " "
else: self.edition = edition[0].contents[0]
def set_original_title(self,book) :
original_title = book.find_all('span', class_="sn_orig_title")
if len(original_title) == 0:
self.original_title = " "
else: self.original_title = original_title[0].contents[0]
def set_original_language(self,book) :
languages = ''
original_languages = book.find_all('span', \
class_="sn_orig_lang")
for language in original_languages:
languages = languages + language.getText() + ', '
self.original_languages = languages
def export(self, country):
''' Function to allow us to easilly pull the text from the
contents of the Book object's attributes and write them to the
country in which the book was published's CSV file.'''
file_name = os.path.join(destination_directory + country + ".csv")
with open(file_name, "a") as by_country_csv:
print(self.author.encode('UTF-8') + " & " + \
self.quality.encode('UTF-8') + " & " + \
self.target_title.encode('UTF-8') + " & " + \
self.target_language.encode('UTF-8') + " & " + \
self.translators.encode('UTF-8') + " & " + \
self.published_city.encode('UTF-8') + " & " + \
self.publisher.encode('UTF-8') + " & " + \
self.published_country.encode('UTF-8') + " & " + \
self.year.encode('UTF-8') + " & " + \
self.pages.encode('UTF-8') + " & " + \
self.edition.encode('UTF-8') + " & " + \
self.original_title.encode('UTF-8') + " & " + \
self.original_languages.encode('UTF-8'), file=by_country_csv)
by_country_csv.close()
def __init__(self, book, country):
''' Initialize the Book object by feeding it the HTML for its
row'''
self.set_author(book)
self.set_quality(book)
self.set_target_title(book)
self.set_target_language(book)
self.set_translator_name(book)
self.set_published_city(book)
self.set_publisher(book)
self.set_published_country(book)
self.set_year(book)
self.set_pages(book)
self.set_edition(book)
self.set_original_title(book)
self.set_original_language(book)
def get_all_pages(country,base_url):
''' Create a list of URLs to be crawled by adding the ISO_3166-1_alpha-3
country code to the URL and then iterating through the results every 10
pages. Returns a string.'''
base_page = urllib2.urlopen(base_url+country)
page = BeautifulSoup(base_page, parse_only=only_restable)
result_number = page.find_all('td',class_="res1",limit=1)
if not result_number:
return 0
str_result_number = str(result_number[0].getText())
results_total = int(str_result_number.split('/')[1])
page.decompose()
return results_total
def build_list(country_code_list, countries):
''' Build the list of all the books, and return a list of Book objects
in case you want to do something with them in something else, ever.'''
for country in country_code_list:
print("Processing %s now..." % countries[country])
results_total = get_all_pages(country, base_url)
for url in range(results_total):
if url % 10 == 0 :
all_books = []
target_page = urllib2.urlopen(base_url + country \
+"&fr="+str(url))
page = BeautifulSoup(target_page, parse_only=only_restable)
books = page.find_all('td',class_="res2")
for book in books:
all_books.append(Book (book,country))
page.decompose()
for title in all_books:
title.export(country)
return
if __name__ == "__main__":
build_list(country_code_list,countries)
print("Completed.")