1

我正在按照本教程尝试从 zillow.com 提取基本的房产信息。更具体地说,我想提取网站上显示的房产卡片(property card)中的信息。

在此处输入图像描述

以下代码只能提取 3 处房产的信息,尽管第一页上有更多的房产卡片。有人能解释一下为什么代码会跳过其余的房产吗?

import ast
import json

import requests
from bs4 import BeautifulSoup

# Zillow search-results page to scrape. The searchQueryState query parameter is
# percent-encoded JSON carrying the pagination, map bounds and filter settings.
url = 'https://www.zillow.com/homes/for_sale/house,multifamily,townhouse_type/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-106.43826441618356%2C%22east%22%3A-103.36483912321481%2C%22south%22%3A38.903882034738686%2C%22north%22%3A40.52008627183672%7D%2C%22mapZoom%22%3A9%2C%22customRegionId%22%3A%22fcac4612c1X1-CR9xde3hldsvpa_v24ah%22%2C%22isMapVisible%22%3Afalse%2C%22filterState%22%3A%7B%22hoa%22%3A%7B%22max%22%3A200%7D%2C%22con%22%3A%7B%22value%22%3Afalse%7D%2C%22apa%22%3A%7B%22value%22%3Afalse%7D%2C%22sch%22%3A%7B%22value%22%3Atrue%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22land%22%3A%7B%22value%22%3Afalse%7D%2C%22schu%22%3A%7B%22value%22%3Afalse%7D%2C%22manu%22%3A%7B%22value%22%3Afalse%7D%2C%22schr%22%3A%7B%22value%22%3Afalse%7D%2C%22apco%22%3A%7B%22value%22%3Afalse%7D%2C%22basf%22%3A%7B%22value%22%3Atrue%7D%2C%22schc%22%3A%7B%22value%22%3Afalse%7D%2C%22schb%22%3A%7B%22min%22%3A%227%22%7D%7D%2C%22isListVisible%22%3Atrue%7D'

headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'en-US,en;q=0.9',
            'cookie': 'zguid=23|%24ca6368b9-7b92-4d51-ab67-c2be89065efd; _ga=GA1.2.1460486079.1621047110; _pxvid=7fa13d96-b528-11eb-9860-0242ac120012; _gcl_au=1.1.2025797213.1621047113; __gads=ID=66253ab863481044:T=1621047113:S=ALNI_MZr3mehwm2Wjo7NOrmalVtEcJSXag; __pdst=50987f626deb4767a53b5d8ca2ea406a; _fbp=fb.1.1621047115574.1019382068; _pin_unauth=dWlkPU5EVm1PRGRpTVRBdE5UTTFaUzAwWlRBNExUZzJZall0TWpZMU1HWTBNV0ppWlRkbA; G_ENABLED_IDPS=google; userid=X|3|231a9d744e104379%7C3%7CiEt8bkUx9hWaFeyCeAwN9tHl_T0d0Cq-kynGuEvNYr4%3D; loginmemento=1|c2274ba4a4ad76bbe89263d30695c182e9177b9c40a2691f3054987d66a944be; zjs_user_id=%22X1-ZU158jhpb2klds9_4wzn7%22; zgcus_lbut=; zgcus_aeut=189997416; zgcus_ludi=b44a961b-c7ef-11eb-a48f-96824e7eff50-18999; optimizelyEndUserId=oeu1623111792776r0.8778663892923859; _cs_c=1; WRUIDAWS=3326630244368428; visitor_id701843=248614376; visitor_id701843-hash=4be116fbd77089f953bfb6eaf5996ef92662a6ef7d237d3c49f154ffaf4eaa9295c64fb254b106bdff234e183c94498c01af2aab; __stripe_mid=80125db1-17d1-4fc5-ae37-86b12a68709cf3da6d; g_state={"i_p":1627697570928,"i_l":4}; zjs_anonymous_id=%22ca6368b9-7b92-4d51-ab67-c2be89065efd%22; _gac_UA-21174015-56=1.1626042638.Cj0KCQjwraqHBhDsARIsAKuGZeH8gi095UkXfohW-WWvyLosdmTdL8cfJwgAabYF9hS2XU6JlXqpWLcaAq5SEALw_wcB; _gcl_aw=GCL.1626042640.Cj0KCQjwraqHBhDsARIsAKuGZeH8gi095UkXfohW-WWvyLosdmTdL8cfJwgAabYF9hS2XU6JlXqpWLcaAq5SEALw_wcB; zgsession=1|1edd82e6-372a-4546-bc8b-c2bbadfd29b4; DoubleClickSession=true; fbc=fb.1.1626412984774.IwAR2QM6bzrTskAWN5Sk8UnmPlAxb1HRy1h1GRch888QqXfczHZZWb2vDZfIw; _fbc=fb.1.1626413249162.IwAR2QM6bzrTskAWN5Sk8UnmPlAxb1HRy1h1GRch888QqXfczHZZWb2vDZfIw; _csrf=lV2BBFim7Vy2gFTn--PUt0VA; _gaexp=GAX1.2.w27igyYtRQaAa8XQM3MjDw.18837.2!VDVoDKTnRcyv8f4FAcJ8PA.18915.2!Khnq27RoQmSe5DEusmh5xA.18913.3; _gid=GA1.2.705011419.1630004829; FSsampler=707279376; __CT_Data=gpv=26&ckp=tld&dm=zillow.com&apv_82_www33=26&cpv_82_www33=26&rpv_82_www33=13; 
OptanonConsent=isIABGlobal=false&datestamp=Fri+Aug+27+2021+12%3A39%3A52+GMT-0600+(Mountain+Daylight+Time)&version=5.11.0&landingPath=NotLandingPage&groups=1%3A1%2C3%3A1%2C4%3A1&AwaitingReconsent=false; _cs_id=41cbdc9c-bb0b-aad9-9521-b1328a65ff77.1623111795.22.1630089665.1630089591.1.1657275795752; utag_main=v_id:01796deff9e3001a59964343177e03079002907100838$_sn:41$_se:2$_ss:0$_st:1630255637884$dc_visit:38$ses_id:1630253822479%3Bexp-session$_pn:1%3Bexp-session$dcsyncran:1%3Bexp-session$tdsyncran:1%3Bexp-session$dc_event:2%3Bexp-session$dc_region:us-east-1%3Bexp-session$ttd_uuid:7b8796ca-44dd-45c9-97d9-bcb642d04cd1%3Bexp-session; JSESSIONID=6CB8C410E0FE216644E8C3A0D0851618; ZILLOW_SID=1|AAAAAVVbFRIBVVsVEklf443J474nftKzJe5PKLD80sujgHvySB7tGcqZunX3BDDH9VwceMqGMTPC54%2F0q4CH%2BfmwsC6P; KruxPixel=true; _derived_epik=dj0yJnU9ai1PSUp1eHZ2Y3J3d0c2NVU1N3BBOFlHbnRBOGFzT0smbj1vLWRISDFwdUNoblN5MjQ4cTVyN213Jm09MSZ0PUFBQUFBR0VzRjRVJnJtPTEmcnQ9QUFBQUFHRXNGNFU; KruxAddition=true; search=6|1632872450375%7Crect%3D40.241821806991595%252C-103.77545313688668%252C39.18758562803622%252C-106.02765040251168%26disp%3Dmap%26mdm%3Dauto%26type%3Dhouse%252Cmultifamily%252Ctownhouse%26fs%3D1%26fr%3D0%26mmm%3D1%26rs%3D0%26ah%3D0%09%0911093%09%09%09%09%09%09; _uetsid=d5e0465006a011ecbe3bd1a0f1c47d01; _uetvid=987e1c70c40a11ebaed8859af36f82fb; _px3=ba45c3df5d5d63d4d9780a102253cd60b21ab52b04778344e332e05474011c21:oCvapPXE6jD0rCXhSf4UjtEC2U956148EDyiWwRFOF8z5vwK63/hC8OWsk09O61g1spnZw64iXApZu1wOmKpyA==:1000:68UzJ5+ar5XwNm61bm41bhSHp8Zp1PfQQlL/5tcqdUIJ3RmA106//vvYGewCCwmln6acqbDAVKgqfB8Th05yX0Cw0TBW7dhfNdeNRjp9bxeLvKqZ56yuW+aVoYYp/zj6MNKv9c16vKlP771xSdCgUTvZ0CDmh7Ng55sHugOHt/jj+2Zmp2WLnuYR4rf7SEndqWBbAyQhhG4BKeyrZyEMpA==; AWSALB=3BIj2fUDeYgoAcLKaZdMkcyTzWSof62v91DQuCssJMyknlpZWcRcVnUU5Me29AcnFcjg1k9H2ehS6N0rSwxo4w8lmEvFCy6hgQfKm1HH8oVoWtpICS36NoLMMxmZ; AWSALBCORS=3BIj2fUDeYgoAcLKaZdMkcyTzWSof62v91DQuCssJMyknlpZWcRcVnUU5Me29AcnFcjg1k9H2ehS6N0rSwxo4w8lmEvFCy6hgQfKm1HH8oVoWtpICS36NoLMMxmZ',
            'referer': 'https://www.google.com/',
            'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
            'sec-ch-ua-mobile': '?1',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36'
          }

# Query parameters handed to requests.get together with `url`; the single
# searchQueryState value is a JSON string (map bounds, filters, pagination).
# NOTE(review): `url` already embeds its own searchQueryState query string, and
# its map bounds differ from the ones below -- confirm which set is intended.
params = {
          'searchQueryState': '{"mapBounds":{"west":-106.02765040251168,"east":-103.77545313688668,"south":39.18758562803622,"north":40.241821806991595},"isMapVisible":true,"filterState":{"sort":{"value":"globalrelevanceex"},"ah":{"value":true},"con":{"value":false},"apco":{"value":false},"land":{"value":false},"apa":{"value":false},"manu":{"value":false},"basf":{"value":true},"hoa":{"max":200},"sch":{"value":true},"schb":{"min":"7"},"schc":{"value":false},"schr":{"value":false},"schu":{"value":false}},"isListVisible":true,"mapZoom":9,"customRegionId":"fcac4612c1X1-CR9xde3hldsvpa_v24ah","pagination":{}}'
          }



class ZillowScraper:
    """Fetch a Zillow search page and extract the JSON-LD data of its cards.

    Only cards that embed a ``<script type="application/ld+json">`` element in
    the downloaded HTML yield a result; cards without one are skipped.
    """

    def __init__(self, url, headers, params):
        """Store the request settings.

        Args:
            url: Address of the search-results page.
            headers: HTTP headers sent with the request.
            params: Query parameters sent with the request.
        """
        self.headers = headers
        self.url = url
        self.params = params

    def fetch(self):
        """Request ``self.url`` and return the ``requests`` response object."""
        # A timeout keeps the script from hanging forever on a stalled server.
        return requests.get(
            url=self.url,
            headers=self.headers,
            params=self.params,
            timeout=30,
        )

    def get_cards_info(self, deck_text):
        """Decode the JSON-LD payload embedded in each property card.

        Args:
            deck_text: The parsed ``<ul>`` element whose children are the
                cards, or ``None`` when that list was not found in the page.

        Returns:
            A list holding one decoded JSON-LD object per card that carries a
            ``<script type="application/ld+json">`` element. Each object is
            also printed. The list is empty when ``deck_text`` is ``None``.
        """
        cards_info = []
        if deck_text is None:
            return cards_info

        for card in deck_text.contents:
            # Text nodes between the cards are str subclasses; calling .find
            # on them would run str.find rather than a tag search.
            if isinstance(card, str):
                continue

            script = card.find('script', {'type': 'application/ld+json'})
            if script is None or not script.contents:
                continue

            # The payload is JSON, so decode it as JSON: ast.literal_eval only
            # understands Python literals and rejects true/false/null.
            script_json = json.loads(str(script.contents[0]))
            print(script_json)
            cards_info.append(script_json)

        return cards_info

    def parse(self, response_text):
        """Parse the page HTML and return the decoded card data.

        Args:
            response_text: HTML source of the search-results page.

        Returns:
            The list produced by ``get_cards_info`` (empty when the card list
            is missing from the page).
        """
        content = BeautifulSoup(response_text, features="html.parser")

        deck_text = content.find('ul', {'class': 'photo-cards photo-cards_wow photo-cards_short photo-cards_extra-attribution'})

        return self.get_cards_info(deck_text)

    def run(self):
        """Fetch the page, parse it and return the decoded card data."""
        response = self.fetch()
        return self.parse(response.text)



if __name__ == "__main__":
    # Script entry point: build a scraper from the module-level request
    # settings and run it once.
    ZillowScraper(url, headers, params).run()

输出

{'@type': 'SingleFamilyResidence', '@context': 'http://schema.org', 'name': '11615 River Run Cir, Henderson, CO 80640', 'floorSize': {'@type': 'QuantitativeValue', '@context': 'http://schema.org', 'value': '2,001'}, 'address': {'@type': 'PostalAddress', '@context': 'http://schema.org', 'streetAddress': '11615 River Run Cir', 'addressLocality': 'Henderson', 'addressRegion': 'CO', 'postalCode': '80640'}, 'geo': {'@type': 'GeoCoordinates', '@context': 'http://schema.org', 'latitude': 39.908753, 'longitude': -104.851576}, 'url': 'https://www.zillow.com/homedetails/11615-River-Run-Cir-Henderson-CO-80640/49457209_zpid/'}
{'@type': 'SingleFamilyResidence', '@context': 'http://schema.org', 'name': '5089 Enid Way, Denver, CO 80239', 'floorSize': {'@type': 'QuantitativeValue', '@context': 'http://schema.org', 'value': '1,852'}, 'address': {'@type': 'PostalAddress', '@context': 'http://schema.org', 'streetAddress': '5089 Enid Way', 'addressLocality': 'Denver', 'addressRegion': 'CO', 'postalCode': '80239'}, 'geo': {'@type': 'GeoCoordinates', '@context': 'http://schema.org', 'latitude': 39.784449, 'longitude': -104.815903}, 'url': 'https://www.zillow.com/homedetails/5089-Enid-Way-Denver-CO-80239/13271929_zpid/'}
{'@type': 'SingleFamilyResidence', '@context': 'http://schema.org', 'name': '6088 S Pierson Ct, Littleton, CO 80127', 'floorSize': {'@type': 'QuantitativeValue', '@context': 'http://schema.org', 'value': '1,810'}, 'address': {'@type': 'PostalAddress', '@context': 'http://schema.org', 'streetAddress': '6088 S Pierson Ct', 'addressLocality': 'Littleton', 'addressRegion': 'CO', 'postalCode': '80127'}, 'geo': {'@type': 'GeoCoordinates', '@context': 'http://schema.org', 'latitude': 39.605764, 'longitude': -105.123466}, 'url': 'https://www.zillow.com/homedetails/6088-S-Pierson-Ct-Littleton-CO-80127/13818492_zpid/'}

4

2 回答 2

3

结果存储在页面内某个 <script> 标签的变量中。要解析它们,可以参考下面的示例:

import json
import requests
from bs4 import BeautifulSoup


# Zillow search-results page to scrape; the searchQueryState query parameter
# carries the map bounds and filter settings as partly percent-encoded JSON.
url = "https://www.zillow.com/homes/for_sale/house,multifamily,townhouse_type/?searchQueryState={%22pagination%22%3A{}%2C%22mapBounds%22%3A{%22west%22%3A-106.97384791227731%2C%22east%22%3A-102.82925562712106%2C%22south%22%3A39.18758562803622%2C%22north%22%3A40.241821806991595}%2C%22customRegionId%22%3A%22fcac4612c1X1-CR9xde3hldsvpa_v24ah%22%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A{%22hoa%22%3A{%22max%22%3A200}%2C%22con%22%3A{%22value%22%3Afalse}%2C%22apa%22%3A{%22value%22%3Afalse}%2C%22sch%22%3A{%22value%22%3Atrue}%2C%22ah%22%3A{%22value%22%3Atrue}%2C%22sort%22%3A{%22value%22%3A%22globalrelevanceex%22}%2C%22land%22%3A{%22value%22%3Afalse}%2C%22schu%22%3A{%22value%22%3Afalse}%2C%22manu%22%3A{%22value%22%3Afalse}%2C%22schr%22%3A{%22value%22%3Afalse}%2C%22apco%22%3A{%22value%22%3Afalse}%2C%22basf%22%3A{%22value%22%3Atrue}%2C%22schc%22%3A{%22value%22%3Afalse}%2C%22schb%22%3A{%22min%22%3A%227%22}}%2C%22isListVisible%22%3Atrue}"
# Only a browser-like User-Agent header is sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0"
}

# Download the search page and parse its HTML.
soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")

# The search results are embedded as JSON in a <script data-zrr-shared-data-key>
# element. Fail with a clear message when the element is missing instead of an
# AttributeError on None.
script_tag = soup.select_one("script[data-zrr-shared-data-key]")
if script_tag is None:
    raise RuntimeError(
        "No <script data-zrr-shared-data-key> element found in the response"
    )

# Strip the leading/trailing '<', '!', '-' and '>' characters that surround the
# JSON text before decoding it.
data = json.loads(script_tag.contents[0].strip("!<>-"))

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

for result in data["cat1"]["searchResults"]["listResults"]:
    # Some listings have no top-level "price" and keep it under the first
    # entry of "units" instead; fall back to that.
    try:
        price = result["price"]
    except KeyError:
        price = result["units"][0]["price"]

    print(
        "{:<15} {:<50} {:<15}".format(
            result["statusText"], result["address"], price
        )
    )

输出:

House for sale  6092 S Marshall Dr, Littleton, CO 80123            $680,000       
House for sale  3050 S Roslyn St, Denver, CO 80231                 $774,900       
House for sale  15538 Greenstone Cir, Parker, CO 80134             $590,000       
House for sale  7141 Fenton Cir, Arvada, CO 80003                  $549,500       
House for sale  7823 S Logan Dr, Littleton, CO 80122               $665,000       
House for sale  1825 Clermont St, Denver, CO 80220                 $599,900       
House for sale  408 S Locust St, Denver, CO 80224                  $550,000       
House for sale  8660 De Soto St, Denver, CO 80229                  $450,000       
House for sale  1811 S Humboldt St, Denver, CO 80210               $675,000       
House for sale  7329 E Easter Ave, Centennial, CO 80112            $699,900       
House for sale  13638 W Montana Pl, Lakewood, CO 80228             $600,000       
House for sale  8296 E Hinsdale Dr, Centennial, CO 80112           $699,900       
House for sale  10325 Ravenswood Ln, Highlands Ranch, CO 80130     $660,000       
House for sale  2833 E 90th Pl, Denver, CO 80229                   $445,000       
House for sale  5756 W 8th Ave, Lakewood, CO 80214                 $600,000       
House for sale  6088 S Pierson Ct, Littleton, CO 80127             $509,000       
House for sale  2829 S Lowell Blvd, Denver, CO 80236               $475,000       
House for sale  604 Eldridge St, Golden, CO 80401                  $650,000       
House for sale  7171 McIntyre Ct, Arvada, CO 80007                 $850,000       
House for sale  1301 S Blackhawk Way, Aurora, CO 80012             $500,000       
House for sale  215 S Julian St, Denver, CO 80219                  $350,000       
House for sale  7095 E 67th Ave, Commerce City, CO 80022           $440,000       
House for sale  8248 S Yukon St, Littleton, CO 80128               $695,000       
House for sale  2846 S Macon Ct, Aurora, CO 80014                  $520,000       
House for sale  9340 Burgundy Cir, Littleton, CO 80126             $799,000       
House for sale  2072 S Cathay Way, Aurora, CO 80013                $560,000       
House for sale  1317 W 85th Ave, Federal Heights, CO 80260         $405,000       
House for sale  6701 Eagle Shadow Ave, Brighton, CO 80602          $1,145,000     
House for sale  2900 Webster St, Wheat Ridge, CO 80033             $660,000       
House for sale  3943 S Allison Ct, Lakewood, CO 80235              $799,950       
House for sale  511 E Irwin Ave, Littleton, CO 80122               $624,500       
House for sale  4700 E Montana Pl, Denver, CO 80222                $600,000       
House for sale  2344 S Gray Dr, Lakewood, CO 80227                 $585,000       
House for sale  5546 E 130th Dr, Thornton, CO 80241                $490,000       
House for sale  2270 S Joyce St, Lakewood, CO 80228                $1,340,000     
House for sale  12171 W Dakota Dr, Lakewood, CO 80228              $600,000       
House for sale  6641 Miller St, Arvada, CO 80004                   $625,000       
House for sale  3220 W Nevada Pl, Denver, CO 80219                 $510,000       
House for sale  8630 W 64th Pl, Arvada, CO 80004                   $447,000       
House for sale  5890 Wood Sorrel Dr, Littleton, CO 80123           $975,000       
于 2021-09-02T11:19:32.390 回答
0

如果上面的代码报错,请试试下面这段代码:

# Download the search page; despite its name, `response` holds the raw body
# bytes (.content), not the response object.
response = requests.get(ZILLOW_URL, headers=headers).content

soup = BeautifulSoup(response, 'html.parser')

# The search results are embedded as JSON in a <script data-zrr-shared-data-key>
# element; strip the surrounding '<', '!', '-' and '>' characters before decoding.
data = json.loads(
    soup.select_one("script[data-zrr-shared-data-key]")
    .contents[0]
    .strip("!<>-")
)
all_data = data['cat1']['searchResults']['listResults']


for listing in all_data:
    # Some items nest the 'price' key inside the first entry of 'units', while
    # others have it directly on the item. An empty 'units' list is treated
    # like a missing one.
    try:
        price = listing['units'][0]['price']
    except (KeyError, IndexError):
        price = listing['price']
    address = listing['address']

    link = listing['detailUrl']
    # Some links are site-relative; prefix those with the site origin. Test the
    # scheme prefix rather than searching for 'http' anywhere in the link.
    if link.startswith(('http://', 'https://')):
        link_to_buy = link
    else:
        link_to_buy = f"https://www.zillow.com{link}"

    print(price)
    print(address)
    print(link_to_buy)
    print("\n")

于 2022-02-23T17:33:45.880 回答