python - 刮黄页

Question

我正在尝试从people.yellowpages.com 抓取数据，我只需要电子邮件、电话、地址。我最近一直在编写此代码，它适用于与业务相关的组织。但是在搜索人员数据时它不起作用。任何人都可以帮助我解决我在这里做错了什么。

注意：我需要从people.yellowpages.com 抓取人员数据。当我尝试运行程序时，它会通过 for 循环然后出错。

import requests
from lxml import html
import unicodecsv as csv
import argparse
import time
def parse_listing(keyword):
    """
    
    Function to process 
    : param keyword: search query
    : param place : place name

    """
    url = "https://people.yellowpages.com/whitepages/?last_name={}".format(keyword)
    print("retrieving ",url)

    headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding':'gzip, deflate, br',
                'Accept-Language':'en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7',
                'Cache-Control':'max-age=0',
                'Connection':'keep-alive',
                'Host':'www.yellowpages.com',
                'Upgrade-Insecure-Requests':'1',
                'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
            }
    # Adding retries
    print("helllo")
    for retry in range(10):
        try:
            response = requests.get(url,verify=False, headers = headers )
            print("parsing page")
            print(response)
            sleep(10)
            if response.status_code==200:
                parser = html.fromstring(response.text)
                #making links absolute
                base_url = "https://people.yellowpages.com/whitepages/?last_name={}".format(keyword)
                parser.make_links_absolute(base_url)
                print(base_url)
                XPATH_LISTINGS = "//div[@class='main-content']//div[@class='phone-result']" 
                listings = parser.xpath(XPATH_LISTINGS)
                scraped_results = []
                print("wait")

                for results in listings:
                    XPATH_fullname = ".//a[@class='fullname']//text()" 
                    XPATH_phone = ".//div[@itemprop='phone']//text()"
                    XPATH_address = ".//div[@class='info']//div//p[@itemprop='address']"
                    #XPATH_AGE = "//*[@id="center"]/div[1]/div/div[1]/div[3]/p"

                    raw_fullname = results.xpath(XPATH_fullname)
                    raw_phone = results.xpath(XPATH_phone)  
                    #raw_AGE = results.xpath(XPATH_AGE)
                    raw_address = results.xpath(XPATH_address)
                    print("worked")
                    fullname = ''.join(raw_fullname).strip() if raw_fullname else None
                    phone = ''.join(raw_phone).strip() if raw_phone else None
                    address = ''.join(raw_address).strip() if raw_address else None
                    #age = ''.join(raw_AGE).strip() if raw_zip_code else None


                    business_details = {
                                        'name':fullname,
                                        'telephone':phone,
                                        'address':address,
                                        #'age':AGE,
                                        'listing_url':response.url
                    }
                    scraped_results.append(business_details)

                return scraped_results
                print(business_details)

            elif response.status_code==404:
                print("Could not find a location matching",keyword)
                #no need to retry for non existing page
                break
            else:
                print("Failed to process page")
                return []
                
        except:
            print("Failed to process page")
            return []


if __name__=="__main__":
    
    argparser = argparse.ArgumentParser()
    argparser.add_argument('keyword',help = 'keyword')
    #argparser.add_argument('place',help = 'Place Name')
    
    args = argparser.parse_args()
    keyword = args.keyword
    #place = args.place
    scraped_data =  parse_listing(keyword,) 
    
    if scraped_data:
        print("Writing scraped data to %s-%s-scraped-data.csv"%(keyword))
        with open('%s-%s-scraped-data.csv'%(keyword,),'wb') as csvfile:
            fieldnames = ['NAME','telephone','ADDRESS','listing_url']
            writer = csv.DictWriter(csvfile,fieldnames = fieldnames,quoting=csv.QUOTE_ALL)
            writer.writeheader()
            for data in scraped_data:
                writer.writerow(data)

score 0 · Accepted Answer

永远不要那样做：

except:

你总是必须指定某些例外。让我们尝试手动运行requests.get：

(Pdb) requests.get(url,verify=False, headers = headers )
/usr/lib/python3.7/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)
....
*** requests.exceptions.TooManyRedirects: Exceeded 30 redirects.

查看错误：requests.exceptions.TooManyRedirects: Exceeded 30 redirects 让我们尝试不使用allow_redirect：

(Pdb) response = requests.get(url,verify=False, headers = headers,  allow_redirects=False)
/usr/lib/python3.7/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)
(Pdb) response
<Response [301]>
(Pdb) response.headers
{'Date': 'Mon, 18 Nov 2019 09:09:35 GMT', 'Content-Type': 'text/html', 'Content-Length': '178', 'Connection': 'keep-alive', 'Location': 'https://people.yellowpages.com/whitepages/?last_name=john', 'Set-Cookie': 'TS0145ce01=01d0bb65df96e04f8ea20dfc3b81c2fbe967f216df827b11fbedaa89ee06a10f05ae6a0759; Path=/'}
(Pdb) url
'https://people.yellowpages.com/whitepages/?last_name=john'
(Pdb) response.headers["Location"]
'https://people.yellowpages.com/whitepages/?last_name=john'

您是否看到 Web 服务器总是将您重定向到相同的 url？可能是问题

'Host':'www.yellowpages.com',

在标题中？

python - 刮黄页

1 回答 1

Related

Reference