
I'm having trouble writing all of this data to a spreadsheet. At the moment it basically only writes the first link's data to the spreadsheet; I can't seem to get the data into all of the columns. This is my first time using xlwt, so I think it may come from a misunderstanding of how data is written to columns. I've been looking over other code but still can't seem to figure it out.

I feel like I need to do something with a for loop?

from BeautifulSoup import BeautifulSoup, SoupStrainer, NavigableString
import requests
import xlwt
import urllib2

book = xlwt.Workbook()
sheet = book.add_sheet('sheet 1')

blog_page_url = 'http://technorati.com'
url = "http://technorati.com/blogs/directory/living/food/page-{}/"

for i in range(2):
    html_doc = requests.get(url.format(i)).text
    page = BeautifulSoup(html_doc)
    x = 0
    for link in page.findAll('a'):
        x += 1
        try:
            if 'blogs/' in link['href'] and '/directory/' not in link['href']:
                if x%3 == 0:
                    print link['href']
                    index = 0
                    sheet.write(index, 1, blog_page_url + link['href'] + '\n') # links to spreadsheet
                    index += 1

                    blog_url = blog_page_url + link['href']
                    blog_page = urllib2.urlopen(blog_url)
                    blog_soup = BeautifulSoup(blog_page)
                    tech_authority_score = blog_soup.find('div', attrs = {'style': 'float: left;'})
                    last_post_group = blog_soup.find('ol', attrs = {'class': 'post-list'})
                    last_post = last_post_group.li.div
                    sheet.write(index, 2, tech_authority_score.text) # tech score to spreadsheet
                    sheet.write(index, 3, last_post.text) # date of last post to spreadsheet
                    index += 1
        except Exception:
            pass  # skip links without an href, or pages missing the expected elements

book.save("link_spreadsheet.xls")
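
For reference, here is a minimal, self-contained sketch of how xlwt addresses cells, using illustrative data and variable names (nothing below comes from the scraped site): sheet.write(row, col, value) takes zero-based row and column indices, so the row counter has to be initialised once before the loop and incremented after each record for successive records to land on separate rows.

import xlwt

demo_book = xlwt.Workbook()
demo_sheet = demo_book.add_sheet('demo')

# Illustrative records: (link, authority score, date of last post)
records = [
    ('http://example.com/blogs/a', '512', '2013-01-01'),
    ('http://example.com/blogs/b', '256', '2013-01-02'),
]

row = 0  # initialised once, outside the loop
for link, score, last_post in records:
    demo_sheet.write(row, 1, link)       # column 1: the link
    demo_sheet.write(row, 2, score)      # column 2: the authority score
    demo_sheet.write(row, 3, last_post)  # column 3: the date of the last post
    row += 1                             # advance so the next record gets its own row

demo_book.save('demo.xls')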