我有这个代码:
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import re
import urllib2
import BeautifulSoup
import csv
origin_site = 'http://typo3.nimes.fr/index.php?id=annuaire_assos&theme=0&rech=&num_page='
get_url = re.compile(r"""window.open\('(.*)','','toolbar=0,""", re.DOTALL).findall
pages = range(1,2)
for page_no in pages:
req = ('%s%s' % (origin_site, page_no))
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }
try:
urllib2.urlopen(req)
except urllib2.URLError, e:
pass
else:
# do something with the page
doc = urllib2.urlopen(req)
soup = BeautifulSoup.BeautifulSoup(doc)
infoblock = soup.findAll('tr', { "class" : "menu2" })
for item in infoblock:
assoc_data = []
soup = BeautifulSoup.BeautifulSoup(str(item))
for tag in soup.recursiveChildGenerator():
if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('td'):
if tag.string is not None:
assoc_name = (tag.string)
if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('u'):
if tag.string is not None:
assoc_theme = (tag.string)
get_onclick = str(soup('a')[0]['onclick']) # get the 'onclick' attribute
url = get_url(get_onclick)[0]
try:
urllib2.urlopen(url)
except urllib2.URLError, e:
pass
else:
assoc_page = urllib2.urlopen(url)
#print assoc_page, url
soup_page = BeautifulSoup.BeautifulSoup(assoc_page)
assoc_desc = soup_page.find('table', { "bgcolor" : "#FFFFFF" })
#print assoc_desc
get_address = str(soup_page('td', { "class" : "menu2" }))
soup_address = BeautifulSoup.BeautifulSoup(get_address)
for tag in soup_address.recursiveChildGenerator():
if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('a'):
if tag.string is not None:
assoc_email = (tag.string)
assoc_data.append(assoc_theme)
assoc_data.append(assoc_name)
assoc_data.append(assoc_email)
for tag in soup_address.recursiveChildGenerator():
if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('td'):
if tag.string is not None:
if tag.string != ' ':
get_string = BeautifulSoup.BeautifulSoup(tag.string)
assoc_data.append(get_string)
#data.append(get_string)
c = csv.writer(open("MYFILE.csv", "wb"))
for item in assoc_data:
c.writerow(item)
但得到这个错误:
UnicodeEncodeError: 'ascii' codec can't encode character u'\xc7' in position 0: ordinal not in range(128)
如何将法语字符传递到 MYFILE.csv 文件中?我可以进一步改进代码吗?