我一直在尝试为 Yelp 编写爬虫。我想获取该页面上列出的供应商链接,我知道链接是在 <a> 标签的 href="..." 属性中给出的,但是返回的数组总是空的。请帮帮我!提前谢谢你 :)
import urllib
import mechanize
from bs4 import BeautifulSoup
import re
# Fetch a Yelp search-results page and print the href of every link found
# inside the search-results list.

# Browser that skips robots.txt handling and sends a custom User-agent header.
br = mechanize.Browser()
br.set_handle_robots(False)
br.addheaders = [('User-agent', 'chrome')]

BASE_URL = "http://www.yelp.com/"

search = "house cleaner"
location = "London, Uk"

# Hand-encode the two query parameters: spaces become "+", commas "%2C".
term = search.replace(" ", "+")
place = location.replace(",", "%2C").replace(" ", "+")
query = (
    BASE_URL + "search?find_desc=" + term
    + "&find_loc=" + place + "&ns=1#start=0"
)

html = br.open(query).read()
# Name the parser explicitly so the parse does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(html, "html.parser")

# <ul> elements whose class attribute is exactly this string.
results = soup.findAll(
    'ul', attrs={'class': 'ylist ylist-bordered search-results'}
)

# Walk the parsed tree directly instead of re-parsing str(...) of each level.
# href=True skips anchors without an href, so a["href"] cannot raise KeyError,
# and list items containing no anchors simply contribute nothing.
vendor_links = []
for result_list in results:
    vendor_links.extend(
        a["href"] for a in result_list.findAll("a", href=True)
    )

# Why the earlier version always printed an empty list: it ran the regex
# "u(?!.*u).*," over str(vendor_links).  That pattern can only start at the
# LAST "u" in the text and then requires a comma somewhere after it; in the
# repr of a list the separating commas all come before the last element, so
# there was normally nothing left to match.
# NOTE(review): if only vendor pages are wanted, filter each href string
# individually here (e.g. by path prefix) - confirm the prefix against the
# live markup.
print(vendor_links)