4

我正在尝试使用 Python 3.6 中的 Beautiful Soup 和 requests 模块来抓取网页,并检索网页上嵌入视频的 URL。当我在 Chrome 中检查网页的 HTML 时,可以看到视频的 .mp4 链接。但是当我使用 requests 和 Beautiful Soup 获取页面时,却找不到 `video` 节点。我知道视频窗口是一个嵌套的 HTML 文档。具体来说,我想使用 Beautiful Soup 和 requests 模块抓取这个网页 - http://videolectures.net/icml2015_liang_language_understanding/ ,并获取视频链接 - http://hydro.ijs.si/v012/6f/n5vruqvdwpj36mdoxxwyxvyg5hje7a4c.mp4 。任何正确方向的帮助将不胜感激。谢谢!

4

1 回答 1

0

这里有一个混淆配置:

http://videolectures.net/icml2015_liang_language_understanding/video/1/page.map

这个 URL 本身是由 `slug` 和 `video` 两个字段拼接而成的,这两个字段位于原始页面中内嵌的 JS 配置里:

var viipg = {
    cfg: {
        slug: 'icml2015_liang_language_understanding',
        type: 'Lecture',
        obj_id: 23694,
        video: 1,
        video_id: 23648,
        videos: [1, 2],
        chrome_colors: ["FFFFFF", "F21F1F"],
        livepipe: '//videolectures.net',
        site_slug: 'vln',
        media_url: 'https://static.videolectures.net/r.1483388978/',
        sentry: '//161ef9a5c3a14af1848399909b890522@sentry.viidea.com/6'
    },
    ..........................
};

该文件需要用 JS 中的一个算法进行解码。我把压缩(minified)过的代码重新整理成了下面的脚本:

"use strict"

// Decoder alphabet: a character's index in this string is its numeric value.
// Order matters: lowercase, uppercase, digits, then punctuation/whitespace.
const b =
    "abcdefghijklmnopqrstuvwxyz" +
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
    "0123456789" +
    "~-_.,;:<>!?*+=\"'#$%&/\\()\n\t ";
// Alphabet size; the decoding arithmetic wraps around modulo this value.
const c = b.length;

/**
 * Derives the numeric key from a key string.
 *
 * Adds up the index in the alphabet `b` of every UTF-16 code unit of `a`;
 * a code unit that is not in the alphabet contributes -1.
 *
 * @param {string} a - Key string (the script passes the lecture slug).
 * @returns {number} The summed indices, or 0 when the sum is negative.
 */
function d(a) {
    const total = a
        .split("")
        .reduce((sum, unit) => sum + b.indexOf(unit), 0);
    return Math.max(total, 0);
}
/**
 * Decodes an encoded payload with a key string.
 *
 * Every character of `a` that belongs to the alphabet `b` is replaced by the
 * alphabet character found `d(e) + 6 * position` places earlier, wrapping
 * around modulo the alphabet size `c`. Characters outside the alphabet are
 * copied unchanged, but still count towards the position.
 *
 * @param {string} a - Encoded text.
 * @param {string} e - Key string; reduced to a number with `d`.
 * @returns {string} The decoded text.
 */
function e(a, e) {
    const shift = d(e);
    return a
        .split("")
        .map((unit, position) => {
            const index = b.indexOf(unit);
            if (index === -1) {
                return unit;
            }
            // Double modulo keeps the result non-negative when the shift
            // exceeds the index (JS `%` keeps the sign of the dividend).
            return b[(((index - shift - 6 * position) % c) + c) % c];
        })
        .join("");
}

const slug = "icml2015_liang_language_understanding";

// The decoder shifts each character by its absolute position in the payload
// and the leading character must be dropped exactly once, so the complete
// body has to be buffered before decoding. Decoding every 'data' chunk on its
// own produces garbage as soon as the response arrives in more than one chunk.
require('http').get(`http://videolectures.net/${slug}/video/1/page.map`, (res) => {
    if (res.statusCode !== 200) {
        res.resume(); // drain the response so the socket is released
        console.error(`Request failed with status ${res.statusCode}`);
        return;
    }
    res.setEncoding('utf8');
    let body = '';
    res.on('data', (chunk) => {
        body += chunk;
    });
    res.on('end', () => {
        console.log(e(body.substring(1), slug));
    });
}).on('error', (err) => {
    console.error(`Request failed: ${err.message}`);
});

它运行良好并将文件解码为有效的 xml :

<?xml version="1.0" standalone="yes"?>
<smil>
<head>
    <meta name="title" content="Natural Language Understanding: Foundations and State-of-the-Art" />
    <meta name="abstract" content="Building systems that can understand human language—being able to answer questions, follow instructions, carry on dialogues—has been a long-standing challenge since the early days of AI. Due to recent advances in machine learning, there is again renewed interest in taking on this formidable task. A major question is how one represents and learns the semantics (meaning) of natural language, to which there are only partial answers. The goal of this tutorial is (i) to describe the linguistic and statistical challenges that any system must address; and (ii) to describe the types of cutting edge approaches and the remaining open problems. Topics include distributional semantics (e.g., word vectors), frame semantics (e.g., semantic role labeling), model-theoretic semantics (e.g., semantic parsing), the role of context, grounding, neural networks, latent variables, and inference. The hope is that this unified presentation will clarify the landscape, and show that this is an exciting time for the machine learning community to engage in the problems in natural language understanding." />
    <meta name="part" content="1" />
    <meta name="date" content="Oct. 28, 2015" />
    <meta name="type" content="Tutorial" />
    <layout></layout>
</head>
<body>
    <switch region="video" dur="0:58:19" type="v">
        <video id="1001866" proto="rtmp" width="400" height="300" systemBitrate="416819" size="182348171" apptype="v" ext="mp4" type="video/mp4"  streamer="rtmp://hydro2.videolectures.net/vod" src="mp4:v012/01/ae6se7y42if65i27glt3yfanffv5rvtk.mp4"/>
        <video id="1001866" proto="m3u8" width="400" height="300" systemBitrate="416819" size="182348171" apptype="v" ext="mp4" type="application/x-mpegURL"  src="http://hydro2.videolectures.net/vod/_definst_/mp4:v012/01/ae6se7y42if65i27glt3yfanffv5rvtk.mp4/playlist.m3u8"/>
        <video id="1001866" proto="secure_download" width="400" height="300" systemBitrate="416819" size="182348171" apptype="v" ext="mp4" type="video/mp4"  src="http://videolectures.net/site/secure_dl/0345805d8bef53e4f3b95b58b3b9a4c2/5f811e37/ae6se7y42if65i27glt3yfanffv5rvtk/tag=1001866/icml2015_liang_language_understanding_01_400x300_h264.mp4"/>
        <video id="1001866" proto="http" width="400" height="300" systemBitrate="416819" size="182348171" apptype="v" ext="mp4" type="video/mp4"  src="http://hydro.ijs.si/v012/01/ae6se7y42if65i27glt3yfanffv5rvtk.mp4"/>

        <image src="http://hydro.ijs.si/v012/41/iff64giyif2niuz3bfwkkv6p7z7nko3g.jpg" width="400" height="300" type="screenshot"/>
        <image src="http://hydro.ijs.si/v012/41/ihgisuqj4vmqfygwzwuf7ifugpihurga.jpg" width="156" height="96" type="thumbnail"/>
    </switch>
</body>
</smil>

JS 部分位于 script-player.js 和 smile.min.js 中。

以下脚本从源页面提取 JSON 参数,获取配置,解码配置并解析 xml 以获取视频 url:

import requests
from bs4 import BeautifulSoup
import re
import json

#extract JSON config
r = requests.get("http://videolectures.net/icml2015_liang_language_understanding/")
# Fail early on an HTTP error instead of searching an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

# Raw string: "\s" inside a normal string literal is an invalid escape sequence.
# `find_all(string=...)` is the current spelling of `findAll(text=...)`.
scripts = soup.find_all(string=re.compile(r"var\s+viipg\s*=\s*{"))
if not scripts:
    raise RuntimeError("could not find the 'var viipg = {' script in the page")

# Capture the object literal assigned to `cfg` inside the script text.
extract = re.search(r".*cfg\s*:\s*({.*}),", scripts[0])
if extract is None:
    raise RuntimeError("could not find the 'cfg' object in the viipg script")
jsObject = extract.group(1)

def dict_str(data):
    """Rewrite a JS object literal so that `json.loads` can read it.

    Wraps every bare key (the text between a `{` or `,` and the next `:`)
    in double quotes, then turns all single quotes into double quotes.
    """
    quoted_keys = re.sub(r"([{,]\s*)([^\"':,]+)(\s*:)", r'\1"\2"\3', data)
    return quoted_keys.replace('\'', '"')

# Normalise the extracted JS object literal to JSON text, then load it.
normalized_cfg = dict_str(jsObject)
config = json.loads(normalized_cfg)

print(config)

#extract xml config
r = requests.get(f'http://videolectures.net/{config["slug"]}/video/{config["video"]}/page.map')
# Fail early on an HTTP error; otherwise an error page would be run through
# the decoder and come out as garbage.
r.raise_for_status()

# Decoder alphabet: a character's index in this string is its numeric value.
# Order matters: lowercase, uppercase, digits, then punctuation/whitespace.
charList = (
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    "~-_.,;:<>!?*+=\"'#$%&/\\()\n\t "
)
# Alphabet size; the decoding arithmetic wraps around modulo this value.
charLen = len(charList)

def updateSlug(a):
    """Reduce a key string to the numeric shift used by `parse`.

    Adds up the position in `charList` of every character of `a`; a character
    that is not in `charList` contributes -1. Returns 0 when the sum is
    negative.
    """
    total = sum(charList.find(ch) for ch in a)
    return max(total, 0)

def parse(data, slug):
    """Decode `data` using `slug` as the key.

    Every character found in `charList` is replaced by the alphabet character
    located `updateSlug(slug) + 6 * position` places earlier, wrapping around
    modulo `charLen`. Characters outside the alphabet are copied unchanged but
    still count towards the position.
    """
    shift = updateSlug(slug)
    decoded = []
    for position, char in enumerate(data):
        index = charList.find(char)
        if index == -1:
            decoded.append(char)
            continue
        # Python's % already yields a non-negative result for a positive
        # modulus, so a single modulo is enough.
        decoded.append(charList[(index - shift - 6 * position) % charLen])
    return "".join(decoded)

#parse xml config
# The first character of the response is skipped before decoding
# (presumably a marker that is not part of the payload; not verified here).
encoded_payload = r.text[1:]
xmlConfig = parse(encoded_payload, config["slug"])
soup = BeautifulSoup(xmlConfig, features="xml")

# One (proto, src) pair per <video> element of the decoded document.
allVideos = []
for video_tag in soup.find_all("video"):
    allVideos.append((video_tag["proto"], video_tag["src"]))
print(allVideos)

# Keep the first entry whose protocol is plain "http".
httpVideo = [entry for entry in allVideos if entry[0] == "http"][0]
print(httpVideo)

在 repl.it 上试试这个

输出:

{'slug': 'icml2015_liang_language_understanding', 'type': 'Lecture', 'obj_id': 23694, 'video': 1, 'video_id': 23648, 'videos': [1, 2], 'chrome_colors': ['FFFFFF', 'F21F1F'], 'livepipe': '//videolectures.net', 'site_slug': 'vln', 'media_url': 'https://static.videolectures.net/r.1483388978/', 'sentry': '//161ef9a5c3a14af1848399909b890522@sentry.viidea.com/6'}
[('rtmp', 'mp4:v012/01/ae6se7y42if65i27glt3yfanffv5rvtk.mp4'), ('m3u8', 'http://hydro2.videolectures.net/vod/_definst_/mp4:v012/01/ae6se7y42if65i27glt3yfanffv5rvtk.mp4/playlist.m3u8'), ('secure_download', 'http://videolectures.net/site/secure_dl/14f30db64e8184a24b2bb4414dcfdc5d/5f8127f8/ae6se7y42if65i27glt3yfanffv5rvtk/tag=1001866/icml2015_liang_language_understanding_01_400x300_h264.mp4'), ('http', 'http://hydro.ijs.si/v012/01/ae6se7y42if65i27glt3yfanffv5rvtk.mp4')]
('http', 'http://hydro.ijs.si/v012/01/ae6se7y42if65i27glt3yfanffv5rvtk.mp4')
于 2020-10-10T03:45:44.450 回答