您可以使用 BeautifulSoup 对 HTML 进行一些简单的解析(我使用 urllib 与网站通信,因为我不熟悉 selenium,但我确信它同样可以工作):
# Fetch one answer page and print the answer text (code included) without
# the line-number column or the answer footer.
import urllib.request  # plain `import urllib` does not load the submodule

from bs4 import BeautifulSoup

# urllib opener with explicit redirect, HTTP and HTTPS handlers
opener = urllib.request.build_opener(
    urllib.request.HTTPRedirectHandler(),
    urllib.request.HTTPHandler(debuglevel=0),
    urllib.request.HTTPSHandler(debuglevel=0),
)

# Get page; the context manager closes the connection once the body is read
with opener.open("http://qr.ae/Rkplrt") as response:
    html = response.read()

# Create BeautifulSoup object
soup = BeautifulSoup(html, "lxml")

# Find the HTML element you want
answer = soup.find('div', {'class': 'ExpandedQText ExpandedAnswer'})
if answer is None:
    # find() returns None when nothing matches; stop with a clear message
    # instead of failing later with an AttributeError.
    raise SystemExit("No answer element found on the page")

# Remove the stuff you don't want. Either element may be missing (e.g. an
# answer without a code listing has no line-number cell), so only extract
# what is actually present.
for tag_name, css_class in (
    ('td', 'linenos'),
    ('div', 'ContentFooter AnswerFooter'),
):
    unwanted = answer.find(tag_name, {'class': css_class})
    if unwanted is not None:
        unwanted.extract()

# Print one stripped text fragment per line
print("\n".join(answer.stripped_strings))
我不完全确定您想要提取什么。上面的脚本只会输出答案内容(包括代码,但不含行号):
This is:
#include <stdio.h>
int v,i,j,k,l,s,a[99];
main()
{
for(scanf("%d", &s);*a-s;v=a[j*=v]-a[i],k=i<s,j+=(v=j<s&&(!k&&!!printf(2+"\n\n%c"-(!l<<!j)," #Q"[l^v?(l^j)&1:2])&&++l||a[i]<s&&v&&v-i+j&&v+i-j))&&!(l%=s),v||(i==j?a[i+=k]=0:++a[i])>=s*k&&++a[--i]);
}
更新:OP 要求将 <a> 和 <img> 标签分别替换为它们的 href 和 src 属性值。下面这个版本的脚本应该可以解决这个问题,并且它还能处理多个答案。
# Fetch a question page and print every answer as plain text, with <a> and
# <img> tags replaced by their href / src values.
import urllib.request  # plain `import urllib` does not load the submodule

from bs4 import BeautifulSoup

# urllib opener with explicit redirect, HTTP and HTTPS handlers
opener = urllib.request.build_opener(
    urllib.request.HTTPRedirectHandler(),
    urllib.request.HTTPHandler(debuglevel=0),
    urllib.request.HTTPSHandler(debuglevel=0),
)

# Get page; the context manager closes the connection once the body is read
with opener.open(
    "https://www.quora.com/Is-it-too-late-for-an-X-year-old-to-learn-how-to-program"
) as response:
    html = response.read()

# Create BeautifulSoup object
soup = BeautifulSoup(html, "lxml")

# Place to store the text of each answer
sections = []

# Find the HTML elements you want
answers = soup.find_all('div', {'class': 'ExpandedQText ExpandedAnswer'})
for answer in answers:
    # Remove the stuff you don't want. Either element may be missing from a
    # given answer, so only extract what is actually present.
    for tag_name, css_class in (
        ('td', 'linenos'),
        ('div', 'ContentFooter AnswerFooter'),
    ):
        unwanted = answer.find(tag_name, {'class': css_class})
        if unwanted is not None:
            unwanted.extract()

    # Replace <a> with its href, then <img> with its src. Links are handled
    # first, so an image nested inside a link disappears with that link.
    # Tags lacking the attribute are left untouched rather than raising
    # KeyError.
    for selector, attribute in (('a', 'href'), ('img', 'src')):
        for tag in answer.select(selector):
            url = tag.get(attribute)
            if url is not None:
                tag.replace_with(url)

    # Attach to output: one stripped fragment per line, blank line after
    sections.append("\n".join(answer.stripped_strings) + '\n\n')

# Print
print("".join(sections))