0

我下面的代码可以获取每个健身房的街道地址,但输出的健身房营业时间间距不对。有人知道我哪里出错了吗?

import urlparse

from bs4 import BeautifulSoup
from bs4 import Tag
import requests
import time
import csv

# Scrape every Planet Fitness club page linked from the sitemap and write one
# CSV row per club: URL, the five address parts, then the club hours.
sitemap = 'https://www.planetfitness.com/sitemap'
sitemap_content = requests.get(sitemap).content
soup = BeautifulSoup(sitemap_content, 'html.parser')

# Club links sit directly inside <td class="club-title"> and start with /gyms.
atags = soup.select('td[class~=club-title] > a[href^="/gyms"]')
links = [atag.get('href') for atag in atags]

# Class names of the <span> children of <p class="address">, in column order.
address_parts = (
    'address-line1',
    'locality',
    'administrative-area',
    'postal-code',
    'country',
)

with open('gyms.csv', 'w') as gf:
    gymwriter = csv.writer(gf)
    for link in links:
        gymurl = urlparse.urljoin(sitemap, link)
        gym_content = requests.get(gymurl).content
        gym_soup = BeautifulSoup(gym_content, 'html.parser')
        gymrow = [gymurl]

        for part in address_parts:
            spans = gym_soup.select(
                'p[class~=address] > span[class~=%s]' % part)
            # Keep the columns aligned with an empty cell instead of raising
            # IndexError when a page lacks one of the address parts.
            gymrow.append(spans[0].text if spans else '')

        for strong in gym_soup.select('div > strong'):
            if strong.text != 'Club Hours':
                continue
            # The hours live in the first element (not text node) that follows
            # the "Club Hours" heading.
            hours_tag = next(
                (sib for sib in strong.next_siblings if isinstance(sib, Tag)),
                None)
            if hours_tag is not None:
                # Plain `.text` concatenates the nested strings with nothing
                # between them, which is what garbles the spacing of the
                # hours. Strip each piece and join the pieces with one space.
                gymrow.append(hours_tag.get_text(' ', strip=True))

        print(gymrow)
        gymwriter.writerow(gymrow)
        # Pause between requests to avoid hammering the site.
        time.sleep(3)

谢谢您的帮助!

4

2 回答 2

-1

下面是一段您可以尝试的可运行代码:

>>>res1 = requests.get(urljoin('https://www.planetfitness.com/', link)).content # ie, the one of the url is 'https://www.planetfitness.com/gyms/albertville-al'
>>>soup1 = BeautifulSoup(res1, 'html.parser')

>>>ps = soup1.find('p',class_ = 'address')
>>>ps
<p class="address" itemprop="address" itemscope="" itemtype="http://schema.org/PostalAddress"><span class="address-line1" itemprop="streetAddress">5850 US Hwy 431</span><br/>
<span class="locality" itemprop="addressLocality">Albertville</span>, <span class="administrative-area" itemprop="addressRegion">AL</span> <span class="postal-code" itemprop="postalCode">35950</span><br/>
<span class="country" itemprop="addressCountry">United States</span></p>

>>>address1 = [p['itemprop'] for p in ps.findAll('span')]
>>>address1
['streetAddress', 'addressLocality', 'addressRegion', 'postalCode', 'addressCountry']
于 2020-05-18T02:47:49.550 回答
-1

这会为每个健身房生成一个 dict,其中键是 itemprop,值是对应的地址项文本。我是在 Colab 中使用 Python 3,所以改了一些导入,不过循环的最后 4 行应该就能得到你想要的结果:

import requests
import urllib
from bs4 import BeautifulSoup


# Collect, for every club page linked from the sitemap, a dict that maps each
# address span's `itemprop` attribute to the span's text.
res = requests.get("https://www.planetfitness.com/sitemap").content
soup = BeautifulSoup(res, 'html.parser')

tds = soup.find_all('td', {'class': 'club-title'})
# Skip cells without a link so a stray cell cannot raise TypeError.
anchors = [td.find('a', href=True) for td in tds]
links = [anchor['href'] for anchor in anchors if anchor is not None]
keywords = ['gyms']

gym_data = []

for link in links:
    if any(keyword in link for keyword in keywords):
        req = urllib.parse.urljoin('https://www.planetfitness.com/', link)
        res = requests.get(req).content
        # Name the parser explicitly, as for the sitemap, so the result does
        # not depend on which parsers happen to be installed.
        site = BeautifulSoup(res, 'html.parser')
        ps = site.find('p', class_='address')
        if ps is None:
            # Page has no address paragraph; nothing to record for it.
            continue
        address_dict = {
            span['itemprop']: span.text
            for span in ps.find_all('span', itemprop=True)
        }
        # The original appended the undefined name `address` (NameError).
        gym_data.append(address_dict)

输出(可能不需要所有这些键……):

[{'streetAddress': '5850 US Hwy 431', 'addressLocality': 'Albertville', 'addressRegion': 'AL', 'postalCode': '35950', 'addressCountry': 'United States'}
{'streetAddress': '987 Market Place', 'addressLocality': 'Alexander City', 'addressRegion': 'AL', 'postalCode': '35010', 'addressCountry': 'United States'}
{'streetAddress': '528 W Town Plaza', 'addressLocality': 'Bessemer', 'addressRegion': 'AL', 'postalCode': '35020', 'addressCountry': 'United States'}
{'streetAddress': '4500 Montevallo Rd', 'addressLocality': 'Birmingham', 'addressRegion': 'AL', 'postalCode': '35210', 'addressCountry': 'United States'}
{'streetAddress': '140 Wildwood Pkwy', 'addressLocality': 'Birmingham', 'addressRegion': 'AL', 'postalCode': '35209', 'addressCountry': 'United States'}
{'streetAddress': '168 Inverness Plaza', 'addressLocality': 'Birmingham', 'addressRegion': 'AL', 'postalCode': '35242', 'addressCountry': 'United States'}
{'streetAddress': '9118 Parkway E', 'addressLocality': 'Birmingham', 'addressRegion': 'AL', 'postalCode': '35206', 'addressCountry': 'United States'}
{'streetAddress': '1727 2nd Ave SW', 'addressLocality': 'Cullman', 'addressRegion': 'AL', 'postalCode': '35055', 'addressCountry': 'United States'}
{'streetAddress': '29685 Renaissance Blvd', 'addressLocality': 'Daphne', 'addressRegion': 'AL', 'postalCode': '36526', 'addressCountry': 'United States'}
{'streetAddress': '809 Beltline Road SW Suite B', 'addressLocality': 'Decatur', 'addressRegion': 'AL', 'postalCode': '35601', 'addressCountry': 'United States'}
{'streetAddress': '3121 Ross Clark Circle', 'addressLocality': 'Dothan', 'addressRegion': 'AL', 'postalCode': '36303', 'addressCountry': 'United States'}
{'streetAddress': '913 Rucker Blvd', 'addressLocality': 'Enterprise', 'addressRegion': 'AL', 'postalCode': '36330', 'addressCountry': 'United States'}
...
]
于 2020-05-18T02:57:16.720 回答