2

我想将使用 API 提取的维基百科内容转换为纯文本。

有什么建议吗?

4

2 回答 2

1

据说有一些 Python 的 MediaWiki 标记解析器/渲染器,可以先用它们把维基标记渲染成 HTML,然后您基本上就可以把 HTML 转换成所需样式的纯文本。不过,我不清楚这种做法的实际效果如何。

于 2010-09-04T12:25:03.443 回答
0

几天前我写了下面这段脚本,用来克隆一个维基媒体网站:

import re
from mediawikitools import *
import os
from sys import argv

def list_all_pages(site):
    """Return the ``allpages`` entries from an API query against *site*.

    Builds an ``action=query`` / ``list=allpages`` request with
    ``aplimit`` set to ``'500'`` and returns whatever is stored under
    ``['query']['allpages']`` in the response, unmodified.

    NOTE(review): no continuation handling is visible here, so wikis with
    more pages than one batch may be truncated unless ``query()`` follows
    continuations itself -- confirm against the API library.
    """
    params = {
        'action': 'query',
        'list': 'allpages',
        'aplimit': '500',
    }
    request = api.APIRequest(site, params)
    response = request.query()
    return response['query']['allpages']

def clone(site):
    """Dump the wikitext of every listed page of *site* into a local directory.

    The output directory is named after the first 20 characters of
    ``site.siteinfo['sitename']`` and is created when missing. Each page
    returned by ``list_all_pages(site)`` is written to
    ``<dir>/<title>.wiki``; titles containing ``/`` get matching
    sub-directories. The path of every page is also recorded in
    ``<dir>/Index``, one path per line.

    Args:
        site: wiki object exposing a ``siteinfo`` mapping with a
            ``'sitename'`` entry; it is also handed to ``list_all_pages``
            and ``page.Page``.

    A ``KeyError`` or ``UnicodeEncodeError`` raised while fetching or
    writing a page is printed and the loop moves on to the next page
    (best effort); any other exception propagates to the caller.
    """
    # The truncated site name is the root of every path built below.
    root = site.siteinfo['sitename'][:20]
    if not os.path.exists(root):
        print('Make Dir %s' % root)
        os.makedirs(root)

    # ``with`` closes (and therefore flushes) the index even if a page
    # raises an exception that is not handled below.
    with open(root + '/' + 'Index', 'w') as index:
        for entry in list_all_pages(site):
            title = entry['title']

            # Titles such as "A/B" need the directory "<root>/A/" to exist
            # before "<root>/A/B.wiki" can be opened.
            slash = title.rfind('/')
            if slash != -1:
                subdir = root + '/' + title[:slash + 1]
                if not os.path.exists(subdir):
                    os.makedirs(subdir)

            target = root + '/' + title + '.wiki'
            # One handle per page; close it as soon as the page is done
            # instead of leaking a descriptor for every page of the wiki.
            with open(target, 'w') as page_file:
                try:
                    # Newline-terminate each entry so the index can be
                    # read back line by line (entries used to run together).
                    index.write(target + '\n')
                    wiki_file = page.Page(site, title)
                    print(target)
                    page_file.write(wiki_file.getWikiText())
                except (KeyError, UnicodeEncodeError) as e:
                    print(e)

if __name__ == '__main__':
    # API endpoint of the wiki to clone, and the User-Agent string that is
    # handed to site.setUserAgent().
    api_url = "http://localhost/wiki/api.php"
    user_agent = 'Mozilla/6.0 (Windows NT 6.2; WOW64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1'

    site = wiki.Wiki(api_url)
    site.setUserAgent(user_agent)
    # If a username and password are needed to access the wiki, call
    # site.login(username, password, force=True) before cloning.
    print(site.siteinfo['sitename'])
    clone(site)
于 2012-11-02T16:56:36.390 回答