You can use lxml.html:
from lxml import html

url = "http://site.com"
doc = html.parse(url).getroot()  # download & parse webpage
doc.make_links_absolute(url)
for element, attribute, link, _ in doc.iterlinks():
    if (attribute == 'href' and element.tag == 'a' and
            'somepage' in link):  # or e.g., re.search('somepage', link)
        print(link)
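If you would rather filter in a single query instead of iterating over every link, the same result can be expressed with XPath. This is a minimal sketch, assuming the same url and the same placeholder substring 'somepage' as above:

from lxml import html

url = "http://site.com"
doc = html.parse(url).getroot()
doc.make_links_absolute(url)
# select the href values of <a> elements whose href contains 'somepage'
for link in doc.xpath("//a[contains(@href, 'somepage')]/@href"):
    print(link)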
Or, using beautifulsoup4 instead:
import re
try:
    from urllib2 import urlopen
    from urlparse import urljoin
except ImportError:  # Python 3
    from urllib.parse import urljoin
    from urllib.request import urlopen

from bs4 import BeautifulSoup, SoupStrainer  # pip install beautifulsoup4

url = "http://site.com"
only_links = SoupStrainer('a', href=re.compile('somepage'))
soup = BeautifulSoup(urlopen(url), parse_only=only_links)
urls = [urljoin(url, a['href']) for a in soup(only_links)]
print("\n".join(urls))