使用 Python:
例如,假设您想从某个站点(如 fxquotes)以 CSV 格式抓取外汇报价
然后...
from BeautifulSoup import BeautifulSoup
import urllib,string,csv,sys,os
from string import replace
# Download historical FX quotes as CSV text and save them to a local file.

# Date parameters for the request. Presumably date1 is the start and date the
# end of the range, in the US format requested by date_fmt=us -- confirm
# against the site.
date_s = '&date1=01/01/08'
date_f = '&date=11/10/08'

# Assemble the query URL: base + dates + currency pair + fixed trailing options.
fx_url = 'http://www.oanda.com/convert/fxhistory?date_fmt=us'
fx_url_end = '&lang=en&margin_fixed=0&format=CSV&redirected=1'
cur1, cur2 = 'USD', 'AUD'
fx_url = fx_url + date_f + date_s + '&exch=' + cur1 + '&exch2=' + cur1
fx_url = fx_url + '&expr=' + cur2 + '&expr2=' + cur2 + fx_url_end

# Fetch the page and parse it.
data = urllib.urlopen(fx_url).read()
soup = BeautifulSoup(data)

# The script expects the CSV payload inside the first <pre> element.
pre_tags = soup.findAll('pre', limit=1)
if not pre_tags:
    # Fail loudly instead of writing the literal text "[]" to the CSV file.
    raise ValueError('no <pre> block found in response from ' + fx_url)

# str() of the result list renders as "[<pre>...</pre>]"; strip that wrapper
# so only the text between the tags remains.
data = str(pre_tags)
data = data.replace('[<pre>', '')
data = data.replace('</pre>]', '')

# Write the CSV text to disk. os.path.join inserts the path separator that
# plain string concatenation of directory and file name would omit.
file_location = '/Users/location_edit_this'
file_name = os.path.join(file_location, 'usd_aus.csv')
with open(file_name, "w") as csv_file:
    csv_file.write(data)
编辑:从表格中获取值(示例来自 palewire):
from mechanize import Browser
from BeautifulSoup import BeautifulSoup
# Scrape an HTML table and print one pipe-delimited record per row.

# Fetch the page with mechanize and parse it.
mech = Browser()
url = "http://www.palewire.com/scrape/albums/2007.html"
page = mech.open(url)
html = page.read()
soup = BeautifulSoup(html)

# Locate the table with border=1.
table = soup.find("table", border=1)

# Skip the first <tr> (presumably the header row -- confirm against the page)
# and read the four cells of every remaining row.
for row in table.findAll('tr')[1:]:
    col = row.findAll('td')
    rank = col[0].string
    artist = col[1].string
    album = col[2].string
    # The fourth cell holds an <img>; take its src attribute as the cover link.
    cover_link = col[3].img['src']
    record = (rank, artist, album, cover_link)
    # Single-argument parenthesised form prints identically under Python 2
    # and also parses under Python 3.
    print("|".join(record))