python - 从网页中提取类中的链接

Question

我正在尝试使用此 python 代码从博客中提取链接：

    #!/usr/bin/env python

"""
Extract all links from a web page
=================================
Author: Laszlo Szathmary, 2011 (jabba.laci@gmail.com)
Website: https://pythonadventures.wordpress.com/2011/03/10/extract-all-links-from-a-web-page/
GitHub: https://github.com/jabbalaci/Bash-Utils

Given a webpage, extract all links.

Usage:
------
./get_links.py <URL>
"""

import sys
import urllib
import urlparse

from BeautifulSoup import BeautifulSoup


class MyOpener(urllib.FancyURLopener):
    """URL opener that identifies itself with a desktop Firefox version string."""
    # urllib's opener classes send `version` as the User-Agent request header.
    version = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15'


def process(url):
    """Fetch *url* and print the absolute target of every link on the page.

    Every <a> tag that carries an href attribute produces one output line.
    Relative hrefs are resolved against *url* before printing.
    """
    myopener = MyOpener()
    page = myopener.open(url)
    try:
        text = page.read()
    finally:
        # Release the connection even when the read fails part-way
        page.close()

    soup = BeautifulSoup(text)

    for tag in soup.findAll('a', href=True):
        print(urlparse.urljoin(url, tag['href']))
# process(url)


def main():
    """Print the links of every page whose URL is given on the command line.

    When no URL argument is supplied, a short usage message is printed and
    the script exits with status 1.
    """
    if len(sys.argv) != 1:
        # At least one URL was passed: handle them in the order given
        for target in sys.argv[1:]:
            process(target)
        return

    print("Jabba's Link Extractor v0.1")
    print("Usage: %s URL [URL]..." % sys.argv[0])
    sys.exit(1)
# main()

#############################################################################

# Run main() only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()

这些链接来自一个博客，其主分类页为 blog.xx/Music/。这段代码会从分类页中提取出 blog.xx/this_album_name/ 这样的链接，但我想要的是分类下各子页面中 class 为 quote 的元素里的链接。

我该如何解析 Music 分类页中的链接，并让 BS 逐个访问每个标题链接，从打开的下一级页面中 class 为 quote 的元素里提取链接？

即 blog.xx/Category

blog.xx/post1.html

blog.xx/post2.html

在上述每个帖子页面上都有一个引用块，其中包含我想抓取的链接。

我是 python 和 BS 的新手，已经尝试了一些变化，但在这一点上我需要帮助。谢谢

score 1 · Accepted Answer

如果我理解正确，您是想跟随页面中的链接进入下一级页面，并从这些页面中抓取链接？下面的代码应该可以做到：

#!/usr/bin/env python

"""
Extract all links from a web page
=================================
Author: Laszlo Szathmary, 2011 (jabba.laci@gmail.com)
Website: https://pythonadventures.wordpress.com/2011/03/10/extract-all-links-from-a-web-page/
GitHub: https://github.com/jabbalaci/Bash-Utils

Given a webpage, extract all links.

Usage:
------
./get_links.py <URL>
"""

import sys
import urllib
import urlparse
import re

from BeautifulSoup import BeautifulSoup


class MyOpener(urllib.FancyURLopener):
    """URL opener that identifies itself with a desktop Firefox version string."""
    # urllib's opener classes send `version` as the User-Agent request header.
    version = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15'


def process(url):
    """Fetch *url* and return the absolute target of every link on the page.

    Every <a> tag that carries an href attribute contributes one entry to the
    returned list, in the order findAll yields them.  Relative hrefs are
    resolved against *url*.  Duplicates are kept; the caller de-duplicates.
    """
    myopener = MyOpener()
    page = myopener.open(url)
    try:
        text = page.read()
    finally:
        # Release the connection even when the read fails part-way
        page.close()

    soup = BeautifulSoup(text)

    return [urlparse.urljoin(url, tag['href'])
            for tag in soup.findAll('a', href=True)]

# process(url)


def main():
    """Crawl outward from the URLs given on the command line.

    Every command-line argument is a start URL.  The scheme-and-host part of
    each one (e.g. ``http://blog.xx``) becomes an allowed site root, and only
    links whose own scheme-and-host part equals one of those roots are
    recorded and followed, so the crawl stays on the starting site(s).

    A progress line is printed after each page and the list of all on-site
    links found is printed at the end.  A page that cannot be fetched is
    reported on stderr and skipped instead of aborting the whole crawl.
    """
    # Scheme plus host only; the host ends at the first '/', '?' or '#'
    root_re = re.compile(r'^https?://[^/?#]*', re.IGNORECASE)

    # Store the urls we were given
    urls_queue = sys.argv[1:]
    urls_found = []
    urls_done = []

    # Set twins of the lists above, so "have we seen this?" is a hash lookup
    # rather than a scan of an ever-growing list
    found_seen = set()
    queued_seen = set(urls_queue)  # everything ever queued, pending or done

    # Get the domains to keep us on the same domain (don't follow external links)
    site_roots = []
    for url in urls_queue:
        mre = root_re.match(url)
        if mre:
            # If we've found a match, add the entire matched string to site_roots
            site_roots.append(mre.group(0))

    # Scheme and host are case-insensitive, so compare them lower-cased
    allowed_roots = set(root.lower() for root in site_roots)

    while urls_queue:

        # Take the most recently queued url (the list is used as a stack)
        url = urls_queue.pop()
        urls_done.append(url)

        try:
            found = process(url)
        except IOError as err:
            # One unreachable page must not throw away everything found so far
            sys.stderr.write("Skipping %s: %s\n" % (url, err))
            found = []

        for uf in found:
            # Compare the link's whole scheme-and-host part with the allowed
            # roots.  A plain prefix test would also accept look-alike hosts
            # such as http://blog.xx.evil.example/ for the root http://blog.xx
            mre = root_re.match(uf)
            if mre is None or mre.group(0).lower() not in allowed_roots:
                continue # Next url, this is off site

            if uf not in found_seen:
                found_seen.add(uf)
                urls_found.append(uf)

            # If it has never been queued, queue it up
            if uf not in queued_seen:
                queued_seen.add(uf)
                urls_queue.append(uf)

        print("Done %d; Queued %d; Found %d" % ( len(urls_done), len(urls_queue), len(urls_found) ))

    print(urls_found)
# main()

#############################################################################

# Run main() only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()

我添加了一个 url 队列和一项检查，以确保不会跟随指向其他站点的链接而“离开本站”。程序会在最后输出找到的所有链接。

请注意，此代码还会继续跟随次级页面上的链接，因此最终可能会把整个站点都抓取一遍。要避免这种情况，可以把主 while 循环中的 urls_queue.append 那一行注释掉（不再向队列中添加新链接），然后在 while 循环之前添加：

# Seed the queue with the links found on the pages given on the command line,
# keeping only those that start with one of the site_roots
urls_queue = [url for inurl in sys.argv[1:] for url in process(inurl) if any([url.startswith(sr) for sr in site_roots])]
urls_queue = list( set(urls_queue) ) # Get rid of duplicates

这会用所提供页面中的链接构建初始队列。因此，对于您的示例，分类页上的链接会被加入队列，而后续页面上的链接则不会。

python - 从网页中提取类中的链接

1 回答 1

Related

Reference