-1

下面的代码可以获取每个健身房的街道地址,但健身房营业时间的输出间距不正确。有人知道我哪里出错了吗?

import urlparse

from bs4 import BeautifulSoup
from bs4 import Tag
import requests
import time
import csv

# Fetch and parse the sitemap page that links to the individual gym pages.
sitemap = 'https://www.planetfitness.com/sitemap'
index_page = BeautifulSoup(requests.get(sitemap).content, 'html.parser')

# Collect the href of every "/gyms..." anchor that sits directly inside a
# <td> carrying the "club-title" class.
links = []
for anchor in index_page.select('td[class~=club-title] > a[href^="/gyms"]'):
    links.append(anchor.get('href'))

# Selectors for the address parts, in the order they appear in a CSV row.
address_selectors = (
    'p[class~=address] > span[class~=address-line1]',
    'p[class~=address] > span[class~=locality]',
    'p[class~=address] > span[class~=administrative-area]',
    'p[class~=address] > span[class~=postal-code]',
    'p[class~=address] > span[class~=country]',
)

with open('gyms.csv', 'w') as gf:
    gymwriter = csv.writer(gf)
    for link in links:
        # The hrefs are relative, so resolve them against the sitemap URL.
        gymurl = urlparse.urljoin(sitemap, link)
        gym_page = BeautifulSoup(requests.get(gymurl).content, 'html.parser')
        gymrow = [gymurl]

        # First match of each selector; a page without one raises IndexError.
        for selector in address_selectors:
            gymrow.append(gym_page.select(selector)[0].text)

        # The hours are taken from the first Tag that follows a
        # <strong>Club Hours</strong> heading; text nodes in between are skipped.
        for strong in gym_page.select('div > strong'):
            if strong.text != 'Club Hours':
                continue
            following_tag = next(
                (node for node in strong.next_siblings if isinstance(node, Tag)),
                None
            )
            if following_tag is not None:
                # NOTE: .text is appended as-is, so any line breaks inside the
                # element end up inside the CSV cell.
                gymrow.append(following_tag.text)

        print(gymrow)
        gymwriter.writerow(gymrow)
        # Pause between requests so the site is not hit in a tight loop.
        time.sleep(3)

感谢您的帮助!

4

3 回答 3

1

您要选择包含在 td 元素(类为 club-title)中的 a 元素,并提取其 href 属性。

from bs4 import BeautifulSoup
from bs4 import Tag
import requests
import urllib.parse
import time
import csv

# Fetch and parse the sitemap page that links to the individual gym pages.
sitemap = 'https://www.planetfitness.com/sitemap'
res = requests.get(sitemap).content
soup = BeautifulSoup(res, 'html.parser')

# The rows in the table of gyms are formatted like so:
# <tr>
# <td class="club-title"><a href="/gyms/albertville-al"><strong>Albertville, AL</strong> <p>5850 US Hwy 431</p></a></td>
# <td class="club-join"><div class="button"><a href="/gyms/albertville-al/offers" title="Join Albertville, AL">Join Now</a></div></td>
# </tr>

# This will find all the links to all the gyms.
atags = soup.select('td[class~=club-title] > a[href^="/gyms"]')
links = [atag.get('href') for atag in atags]

# Selectors for the address parts, in the order they appear in a CSV row.
address_selectors = (
    'p[class~=address] > span[class~=address-line1]',
    'p[class~=address] > span[class~=locality]',
    'p[class~=address] > span[class~=administrative-area]',
    'p[class~=address] > span[class~=postal-code]',
    'p[class~=address] > span[class~=country]',
)

# newline='' hands line-ending control to the csv module; without it,
# platforms that write \r\n get an extra blank line after every row.
with open('gyms.csv', 'w', newline='') as gf:
    gymwriter = csv.writer(gf)
    for link in links:
        # Follow the link to this gym (hrefs are relative to the sitemap URL).
        gymurl = urllib.parse.urljoin(sitemap, link)
        res = requests.get(gymurl).content
        soup = BeautifulSoup(res, 'html.parser')
        gymrow = [gymurl]

        # The address of this gym: first match of each selector.
        # A page that lacks one of the spans raises IndexError.
        for selector in address_selectors:
            gymrow.append(soup.select(selector)[0].text)

        # The hours of this gym: the first Tag following a
        # <strong>Club Hours</strong> heading.
        strongs = soup.select('div > strong')
        for strong in strongs:
            if strong.text == 'Club Hours':
                for sibling in strong.next_siblings:
                    if isinstance(sibling, Tag):
                        hours = sibling.text
                        # Turn the line breaks inside the hours text into
                        # ", " so each gym stays on one CSV line. The '<br>'
                        # replace only matters if the text holds a literal
                        # "<br>"; real <br> tags are already dropped by .text.
                        gymrow.append(hours.replace('<br>', '').replace('\n', ', '))
                        break

        gymwriter.writerow(gymrow)
        # Pause between requests so the site is not hit in a tight loop.
        time.sleep(3)

当我运行它时,我得到:

$ more gyms.csv

https://www.planetfitness.com/gyms/albertville-al,5850 US Hwy 431,Albertville,AL,35950,United States,"Monday-Friday 6am-9pm, Sat
urday-Sunday 7am-7pm"
https://www.planetfitness.com/gyms/alexander-city-al,987 Market Place,Alexander City,AL,35010,United States,Convenient hours whe
n we reopen
https://www.planetfitness.com/gyms/bessemer-al,528 W Town Plaza,Bessemer,AL,35020,United States,Convenient hours when we reopen
https://www.planetfitness.com/gyms/birmingham-crestline-al,4500 Montevallo Rd,Birmingham,AL,35210,United States,Convenient hours
 when we reopen
.
.
.
于 2020-05-17T23:46:28.047 回答
0

要调试它,您应该首先打印出 atags 的值。您正在搜索类为 clubs-list 的所有 a 标签,但这个类并不存在。a 标签本身没有类,但它们的父元素 td 有类 club-title。

你可以尝试这样的事情。

# Download and parse the sitemap page.
page = requests.get("https://www.planetfitness.com/sitemap").content
soup = BeautifulSoup(page, 'html.parser')

# Take the href of the first <a> inside every <td class="club-title"> cell.
cells = soup.find_all('td', {'class': 'club-title'})
links = [cell.find('a')['href'] for cell in cells]
keywords = ['gyms']

# Keep only the links that contain at least one of the keywords, then print
# them in their original order.
matching_links = [
    link for link in links
    if any(keyword in link for keyword in keywords)
]
for link in matching_links:
    print(link)
于 2020-05-17T23:25:46.667 回答
0

这将获得该页面上的每个链接和地址。看起来,如果您想找到每个俱乐部的更多信息,就必须逐个遍历并加载每个俱乐部的页面。

from bs4 import BeautifulSoup
import requests

# Download and parse the sitemap page.
res = requests.get("https://www.planetfitness.com/sitemap").content
soup = BeautifulSoup(res, 'html.parser')

# Despite the name, these are the <td class="club-title"> cells, not <a> tags.
atags = soup.find_all('td', {'class': 'club-title'})

# Pair each cell's link target with the text of its <p> element.
# (The original line had a stray ')' before the closing ']', a SyntaxError.)
links = [(atag.find('a')['href'], atag.find('p').text) for atag in atags]

# A plain loop instead of a list comprehension used only for its side effect.
for link in links:
    print(link)
于 2020-05-17T23:34:28.630 回答