1

我有这个代码:

#!/usr/local/bin/python
# -*- coding: utf-8 -*-

import re
import urllib2
import BeautifulSoup
import csv

# Scrape the association directory: walk the listing pages, follow each row's
# pop-up link to the association's own page, and write the collected values
# to MYFILE.csv.
origin_site = 'http://typo3.nimes.fr/index.php?id=annuaire_assos&theme=0&rech=&num_page='

# Callable returning whatever appears as the first quoted argument of a
# window.open('...','','toolbar=0,...) call in the text it is given.
get_url = re.compile(r"""window.open\('(.*)','','toolbar=0,""", re.DOTALL).findall

# range(1,2) yields only page 1.
pages = range(1,2)

for page_no in pages:
    req = ('%s%s' % (origin_site, page_no))
    # NOTE(review): user_agent/headers are built here but never passed to
    # urlopen below, so they have no effect on the request.
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = { 'User-Agent' : user_agent }
    try:
        # Probe request: its result is discarded and the same URL is fetched
        # again in the else branch.
        urllib2.urlopen(req)
    except urllib2.URLError, e:
        pass 
    else:
        # do something with the page
        doc = urllib2.urlopen(req)
        soup = BeautifulSoup.BeautifulSoup(doc)
        infoblock = soup.findAll('tr', { "class" : "menu2" })
        for item in infoblock:
            assoc_data = []
            # Re-parse this row's markup on its own (rebinds `soup`).
            soup = BeautifulSoup.BeautifulSoup(str(item))
            for tag in soup.recursiveChildGenerator():
                # ('td') is a plain string, not a tuple, so `in` is a
                # substring test against 'td'.
                # The last <td>/<u> whose .string is not None wins.
                if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('td'):
                    if tag.string is not None:
                        assoc_name = (tag.string)
                if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('u'):
                    if tag.string is not None:
                        assoc_theme = (tag.string)

            get_onclick = str(soup('a')[0]['onclick']) # get the 'onclick' attribute
            # First URL captured from the onclick handler of the row's first link.
            url = get_url(get_onclick)[0]

            try:
                # Same probe-then-refetch pattern as for the listing page.
                urllib2.urlopen(url)
            except urllib2.URLError, e:
                pass 
            else:
                assoc_page = urllib2.urlopen(url)
                #print assoc_page, url
                soup_page = BeautifulSoup.BeautifulSoup(assoc_page)
                # NOTE(review): assoc_desc is assigned but not used afterwards.
                assoc_desc = soup_page.find('table', { "bgcolor" : "#FFFFFF" })
                #print assoc_desc
                # Stringify every <td class="menu2"> and parse that text again.
                get_address = str(soup_page('td', { "class" : "menu2" }))
                soup_address = BeautifulSoup.BeautifulSoup(get_address)
                for tag in soup_address.recursiveChildGenerator():
                    if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('a'):
                        if tag.string is not None:
                            assoc_email = (tag.string)
                # NOTE(review): these three names are only assigned inside the
                # `if` branches above; when nothing matched for this row they
                # hold the previous row's value, or are undefined on the first row.
                assoc_data.append(assoc_theme)
                assoc_data.append(assoc_name)
                assoc_data.append(assoc_email)
                for tag in soup_address.recursiveChildGenerator():
                    if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('td'):
                        if tag.string is not None:
                            if tag.string != ' ':
                                # Appends a BeautifulSoup object built from the
                                # cell text rather than the text itself.
                                get_string = BeautifulSoup.BeautifulSoup(tag.string)
                                assoc_data.append(get_string)
                                #data.append(get_string)

            # The file is reopened in "wb" mode for every row of infoblock, so
            # each iteration truncates what the previous one wrote.
            c = csv.writer(open("MYFILE.csv", "wb"))
            # This loop rebinds `item`, the outer loop variable.
            for item in assoc_data:
                # writerow() expects a sequence of fields but gets one value.
                # NOTE(review): presumably this is where the reported
                # UnicodeEncodeError is raised, since the values are unicode
                # text and Python 2's csv module does not support Unicode
                # input - confirm against the traceback.
                c.writerow(item)

但得到这个错误:

UnicodeEncodeError: 'ascii' codec can't encode character u'\xc7' in position 0: ordinal not in range(128)

如何才能把法语字符正确写入 MYFILE.csv 文件?另外,这段代码还有哪些可以改进的地方?

4

3 回答 3

3

问题是我没有正确使用 unicode,这是最新的代码

#!/usr/local/bin/python
# -*- coding: utf-8 -*-

import urllib2
import BeautifulSoup
import csv

origin_site = 'http://typo3.nimes.fr/index.php?id=annuaire_assos&theme=0&rech=&num_page='

pages = range(1,21)

assoc_table = []

for page_no in pages:
    print page_no
    req = ('%s%s' % (origin_site, page_no))
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = { 'User-Agent' : user_agent }
    try:
        doc = urllib2.urlopen(req)
    except urllib2.URLError, e:
        pass 
    else:
        # do something with the page    
        soup = BeautifulSoup.BeautifulSoup(doc)
        for row in soup.findAll('tr', { "class" : "menu2" }):
            assoc_data = []
            item = row.renderContents()
            soup = BeautifulSoup.BeautifulSoup(item)
            # we get the Thème
            for assoc_theme in soup.findAll('u'):
                assoc_data.append(assoc_theme.renderContents())
            # we get the Nom de l'association
            for assoc_name in soup.findAll('td', { "width": "70%"}):
                assoc_data.append(assoc_name.renderContents())
            # we list all the links to the indivudual pages
            for i in soup.findAll('a', {'href':'#'}):
                if 'associations' in i.attrMap['onclick']:
                    req = i.attrMap['onclick'].split('\'')[1]
                    try:
                        doc = urllib2.urlopen(req)
                    except urllib2.URLError, e:
                        pass
                    else:
                        soup = BeautifulSoup.BeautifulSoup(doc)
                        emails = []
                        web_sites = []
                        for tag in soup.recursiveChildGenerator():
                            if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('a'):
                                assoc_link = (tag.string)
                                if '@' in str(assoc_link):
                                    print assoc_link
                                    emails.append(assoc_link)
                        if emails != []:
                            assoc_data.append(emails[0])
                        else:
                            assoc_data.append('pas du email')
                        for tag in soup.recursiveChildGenerator():
                            if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('a'):
                                assoc_link = (tag.string)
                                if 'http' in str(assoc_link):
                                    web_sites.append(assoc_link)
                            #
                        if web_sites != []:
                            assoc_data.append(web_sites[0])
                        else:
                            assoc_data.append('pas du site web')
                        assoc_addr = [] 
                        assoc_cont = soup.findAll('td', { "width" : "49%", "class": "menu2" })
                        for i in assoc_cont:
                            assoc_addr.append(i.renderContents())
                        assoc_tels = []
                        for addr in assoc_addr:
                            assoc_data.append(addr)
                        assoc_tel = soup.findAll('td', { "width" : "45%", "class": "menu2" })
                        for i in assoc_tel:
                            assoc_tels.append(i.renderContents())
                        assoc_data.append(assoc_tels[0])
                        print assoc_tels[0]
            assoc_table.append(assoc_data)
            print assoc_data
print assoc_table
c = csv.writer(open("nimes_assoc.csv", "wb"))
for item in assoc_table:
    #print item
    c.writerow(item)

感谢大家的帮助,也感谢 tutor@python.org 邮件列表提供的帮助。

于 2012-10-15T17:22:01.340 回答
3

看起来 urllib2 返回的结果是 Unicode,而 CSV 模块不支持 Unicode,只能处理 8 位字符串。

因此,您必须在写入每个字符串之前先将其编码为 UTF-8。例如:

       c = csv.writer(open("MYFILE.csv", "wb"))
       for item in assoc_data:
         # Skip None and empty strings: only non-empty values are written
         if not item:
           continue
         # The csv module needs 8-bit strings, so encode each value to UTF-8
         # and write it as a one-cell row
         c.writerow([item.encode("UTF-8")])
于 2012-10-14T22:24:33.220 回答
3

请滚动到此页面的底部:http://docs.python.org/library/csv.html

具体来说,请使用下面这个 writer 类:

class UnicodeWriter:
    """
    CSV writer that emits rows to the file-like object "f" in a chosen
    text encoding.

    Each row is first formatted by a regular csv.writer into an in-memory
    buffer as UTF-8, then decoded and re-encoded into the target encoding
    before being written to "f".
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Destination stream and the incremental encoder for its encoding
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()
        # The csv.writer formats rows into this in-memory queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)

    def writerow(self, row):
        """Write one row; every cell must provide an encode() method."""
        utf8_cells = [cell.encode("utf-8") for cell in row]
        self.writer.writerow(utf8_cells)
        # Take the UTF-8 formatted line out of the queue as text ...
        line = self.queue.getvalue().decode("utf-8")
        # ... re-encode it for the target stream and write it out
        self.stream.write(self.encoder.encode(line))
        # Reset the queue for the next row
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write each row of "rows" in order via writerow()."""
        for current_row in rows:
            self.writerow(current_row)

然后,不要再使用

c = csv.writer(open("MYFILE.csv", "wb"))

而是改用

c = UnicodeWriter(open("MYFILE.csv", "wb"))
于 2012-10-14T23:41:23.627 回答