我正在尝试从people.yellowpages.com 抓取数据,我只需要电子邮件、电话、地址。我最近一直在编写此代码,它适用于与业务相关的组织。但是在搜索人员数据时它不起作用。任何人都可以帮助我解决我在这里做错了什么。
注意:我需要从people.yellowpages.com 抓取人员数据。当我尝试运行程序时,它会通过 for 循环然后出错。
import requests
from lxml import html
import unicodecsv as csv
import argparse
import time
def parse_listing(keyword):
"""
Function to process
: param keyword: search query
: param place : place name
"""
url = "https://people.yellowpages.com/whitepages/?last_name={}".format(keyword)
print("retrieving ",url)
headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7',
'Cache-Control':'max-age=0',
'Connection':'keep-alive',
'Host':'www.yellowpages.com',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
}
# Adding retries
print("helllo")
for retry in range(10):
try:
response = requests.get(url,verify=False, headers = headers )
print("parsing page")
print(response)
sleep(10)
if response.status_code==200:
parser = html.fromstring(response.text)
#making links absolute
base_url = "https://people.yellowpages.com/whitepages/?last_name={}".format(keyword)
parser.make_links_absolute(base_url)
print(base_url)
XPATH_LISTINGS = "//div[@class='main-content']//div[@class='phone-result']"
listings = parser.xpath(XPATH_LISTINGS)
scraped_results = []
print("wait")
for results in listings:
XPATH_fullname = ".//a[@class='fullname']//text()"
XPATH_phone = ".//div[@itemprop='phone']//text()"
XPATH_address = ".//div[@class='info']//div//p[@itemprop='address']"
#XPATH_AGE = "//*[@id="center"]/div[1]/div/div[1]/div[3]/p"
raw_fullname = results.xpath(XPATH_fullname)
raw_phone = results.xpath(XPATH_phone)
#raw_AGE = results.xpath(XPATH_AGE)
raw_address = results.xpath(XPATH_address)
print("worked")
fullname = ''.join(raw_fullname).strip() if raw_fullname else None
phone = ''.join(raw_phone).strip() if raw_phone else None
address = ''.join(raw_address).strip() if raw_address else None
#age = ''.join(raw_AGE).strip() if raw_zip_code else None
business_details = {
'name':fullname,
'telephone':phone,
'address':address,
#'age':AGE,
'listing_url':response.url
}
scraped_results.append(business_details)
return scraped_results
print(business_details)
elif response.status_code==404:
print("Could not find a location matching",keyword)
#no need to retry for non existing page
break
else:
print("Failed to process page")
return []
except:
print("Failed to process page")
return []
if __name__=="__main__":
argparser = argparse.ArgumentParser()
argparser.add_argument('keyword',help = 'keyword')
#argparser.add_argument('place',help = 'Place Name')
args = argparser.parse_args()
keyword = args.keyword
#place = args.place
scraped_data = parse_listing(keyword,)
if scraped_data:
print("Writing scraped data to %s-%s-scraped-data.csv"%(keyword))
with open('%s-%s-scraped-data.csv'%(keyword,),'wb') as csvfile:
fieldnames = ['NAME','telephone','ADDRESS','listing_url']
writer = csv.DictWriter(csvfile,fieldnames = fieldnames,quoting=csv.QUOTE_ALL)
writer.writeheader()
for data in scraped_data:
writer.writerow(data)