我想把通过 API 获取的维基百科内容转换为纯文本。
有什么建议吗?
据说有一些 Python 的 MediaWiki 标记解析器/渲染器,借助它们基本上可以把 HTML 转换成您所需样式的纯文本。不过,我不确定实际效果如何。
几天前我写了下面这段脚本,用来克隆一个维基媒体网站:
import re
from mediawikitools import *
import os
from sys import argv
def list_all_pages(site):
    """Return the ``allpages`` listing reported by the wiki's query API.

    Sends an ``action=query`` / ``list=allpages`` request (``aplimit`` of
    500) for ``site`` and returns whatever the response holds under
    ``['query']['allpages']``.
    """
    params = {'action': 'query', 'list': 'allpages', 'aplimit': '500'}
    request = api.APIRequest(site, params)
    return request.query()['query']['allpages']
def clone(site):
    """Dump the wikitext of every page of ``site`` into a local directory.

    The output directory is named after the first 20 characters of
    ``site.siteinfo['sitename']`` and is created if it does not exist.
    For each entry returned by ``list_all_pages(site)``:

    * a sub-directory is created when the page title contains ``/``;
    * the page's wikitext is written to ``<dir>/<title>.wiki``;
    * the page's path is appended, one per line, to ``<dir>/Index``.

    ``KeyError`` and ``UnicodeEncodeError`` raised while recording,
    fetching or writing a page are printed and the loop moves on to the
    next page, so a single bad page does not abort the run.
    """
    # The truncated site name is the root of everything written below.
    root = site.siteinfo['sitename'][:20]
    if not os.path.exists(root):
        print('Make Dir %s' % root)
        os.makedirs(root)

    # ``with`` guarantees the index (and each page file below) is flushed
    # and closed even if an unexpected exception escapes the loop.
    with open(root + '/' + 'Index', 'w') as index:
        for test_page in list_all_pages(site):
            title = test_page['title']
            page_path = root + '/' + title + '.wiki'

            # Titles such as "Help/Intro" need their parent directory.
            last_slash = title.rfind('/')
            if last_slash != -1:
                subdir = root + '/' + title[:last_slash + 1]
                if not os.path.exists(subdir):
                    os.makedirs(subdir)

            with open(page_path, 'w') as page_file:
                try:
                    # One path per line; without the newline all entries
                    # would run together and the index would be unusable.
                    index.write(page_path + '\n')
                    wiki_file = page.Page(site, title)
                    print(page_path)
                    page_file.write(wiki_file.getWikiText())
                except KeyError as e:
                    print(e)
                except UnicodeEncodeError as e:
                    print(e)
if __name__ == '__main__':
    # Script entry point: connect to the wiki's API endpoint and mirror it.
    site = wiki.Wiki("http://localhost/wiki/api.php")
    # Identify with a browser-style User-Agent string.
    site.setUserAgent('Mozilla/6.0 (Windows NT 6.2; WOW64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1')
    # If the wiki needs a username and password to access it, log in
    # before cloning, e.g.:
    #     site.login(username, password, force=True)
    print(site.siteinfo['sitename'])
    clone(site)