我正在尝试抓取一个网页，其中包含带上标的数字，以及以分子/分母形式显示的分数。抓取到的内容会保存到一个 Word 文档（.docx）中，为此我使用了 python-docx 模块。例如，表达式 112 x 5⁴ 被保存成了 112 x 54，分数 ¼ 被保存成了 1（换行）（制表符）4。我使用的是 Python 3.5 和 Windows 8.1 操作系统。
代码-
# This script heavily relies on the HTML structure of webpages mentioned below
# indiabix.com/aptitude/simple-interest, indiabix.com/aptitude/numbers which are similar
import sys
import urllib.request
from docx import Document
from bs4 import BeautifulSoup
# Module-level placeholders; ``soup`` is rebound by the script entry point.
soup = para = ""
# Site prefix that is combined with the pager hrefs to form page URLs.
root = "http://www.indiabix.com"
def getQuestions(link):
    """Fetch one page and append each question with its options to ``doc``.

    Every ``div.bix-div-container`` on the page is treated as one question:
    the text of its ``td.bix-td-qtxt`` cell becomes the first line of a new
    paragraph, and the text of every ``td.bix-td-option`` cell follows on a
    second line, each option followed by a single space.

    Relies on the module-level ``doc`` (the ``Document`` being built) having
    been created before the call.

    NOTE: ``get_text()`` returns plain text only, so inline markup such as
    ``<sup>`` is flattened and a superscripted exponent is written as an
    ordinary digit.

    :param link: URL of the page to scrape.
    :return: the parsed ``BeautifulSoup`` tree of the page, so the caller
        can look up the pager links.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(link) as response:
        soup = BeautifulSoup(response.read(), 'html5lib')

    for box in soup.find_all('div', {'class': 'bix-div-container'}):
        question = box.find("td", {"class": "bix-td-qtxt"})
        if question is None:
            # Container without a question cell: nothing to record.
            continue
        options = box.find_all("td", {'class': 'bix-td-option'})
        option_line = "".join(option.get_text() + " " for option in options)
        # Build the paragraph text once rather than re-reading and
        # re-assigning ``para.text`` for every option.
        doc.add_paragraph(question.get_text() + "\n" + option_line)
    return soup
def getExtras(soup):
    """Scrape the follow-up pages linked from the pager of ``soup``.

    Looks up the ``p.ib-pager`` element and calls ``getQuestions`` for every
    link in it except the last one. Pages without a pager are left alone.

    :param soup: parsed page, as returned by ``getQuestions``.
    :return: None
    """
    from urllib.parse import urljoin

    pager = soup.find('p', {'class': 'ib-pager'})
    if pager is None:
        # No pager on this page, so there are no further pages to fetch.
        return
    # The last anchor is deliberately skipped, as before; presumably it is a
    # "next" link that duplicates a numbered page -- TODO confirm.
    for anchor in pager.find_all('a')[:-1]:
        href = anchor.get('href')
        if not href:
            continue
        # urljoin gives the same URL as ``root + href`` for "/path" hrefs and
        # also copes with absolute or slash-less hrefs.
        getQuestions(urljoin(root, href))
    return
if __name__ == '__main__':
    # Create the output document first: getQuestions() writes into the
    # module-level ``doc``.
    doc = Document()
    # Here, the string "http://indiabix.com/aptitude/numbers" is used as the input link
    link = input("Enter link : ")
    # Scrape the first page, then the remaining pages linked from its pager.
    soup = getQuestions(link)
    getExtras(soup)
    doc.save('questions.docx')