下面的代码从 ESPN/college-football 页面提取头条新闻,然后进入每篇文章本身并提取其中所有 <p> 标签的内容。
from urllib import urlopen
from BeautifulSoup import BeautifulSoup
import datetime
import smtplib
from email.header import Header
from email.mime.text import MIMEText
from smtplib import SMTP_SSL

# Page whose <ul class="headlines"> lists are scraped.
HEADLINES_URL = 'http://espn.go.com/college-football'

# Envelope placeholders; replace with real addresses before use.
MAIL_FROM = 'FROM'
MAIL_TO = 'TO'

# Credentials (if needed)
# NOTE(review): hard-coded credentials -- consider reading them from the
# environment or a config file that is kept out of version control.
username = 'username'
password = 'password'


def fetch_page(url):
    """Return the raw content of *url*, always closing the connection."""
    response = urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def scrape_articles():
    """Yield ``(headline, link, paragraphs)`` for every headline link.

    ``headline`` is the anchor text, ``link`` its ``href`` and
    ``paragraphs`` the text of every <p> tag found on the linked page.
    Anchors without an ``href`` are skipped because there is nothing to
    fetch for them.
    """
    soup = BeautifulSoup(fetch_page(HEADLINES_URL))
    # to get the contents of <ul> tags w/ attribute class="headlines":
    for headline_list in soup.findAll('ul', {'class': 'headlines'}):
        for item in headline_list.findAll('li'):
            # NOTE(review): the dict is passed as the tag *name* argument, so
            # it presumably selects tags named 'a' or 'title' rather than
            # filtering on a title attribute -- confirm against the
            # BeautifulSoup 3 documentation. Kept exactly as in the original.
            for anchor in item.findAll({'a' : True, 'title' : False}):
                link = anchor.get('href')
                if not link:
                    continue
                # Grab all of the content from the original article and
                # collect the text of each of its <p> tags.
                article = BeautifulSoup(fetch_page(link))
                paragraphs = [p.text for p in article.findAll('p')]
                yield anchor.text, link, paragraphs


def format_article(headline, link, paragraphs):
    """Return one article as text: headline, link, blank line, paragraphs."""
    lines = [headline, link, u""]
    lines.extend(paragraphs)
    return u"\n".join(lines)


def build_message(body, sent_at):
    """Return a UTF-8 plain-text MIME message carrying *body*.

    *sent_at* is a ``datetime`` used to stamp the subject line. *body* is
    passed through as text (no ``str()`` coercion) so non-ASCII characters
    in the scraped articles do not raise ``UnicodeEncodeError``.
    """
    msg = MIMEText(body, 'plain', 'utf-8')
    msg['Subject'] = Header('ESPN Scrape from: '+ sent_at.strftime("%Y-%m-%d %H:%M"), 'utf-8')
    msg['From'] = MAIL_FROM
    msg['To'] = MAIL_TO
    return msg


def send_message(msg):
    """Send *msg* via gmail over SSL, releasing the connection afterwards."""
    server = SMTP_SSL('smtp.gmail.com')
    try:
        server.login(username, password)
        server.sendmail(msg['From'], msg['To'], msg.as_string())
        server.quit()
    finally:
        # close() is a no-op after a successful quit(); it only matters when
        # login/sendmail raised and the socket would otherwise stay open.
        server.close()


def main():
    """Scrape the headlines, echo them to the screen and mail all of them.

    Every article is accumulated into a single message body, so the e-mail
    contains all headlines and all paragraphs instead of only the values
    left in the loop variables after the final iteration.
    """
    now = datetime.datetime.now()
    sections = []
    for headline, link, paragraphs in scrape_articles():
        print(headline)
        print(link)
        print("\n")
        # Print all of the paragraphs to screen
        for text in paragraphs:
            print(text)
            print("\n")
        sections.append(format_article(headline, link, paragraphs))

    if not sections:
        # Nothing was scraped; do not send an empty message.
        return
    send_message(build_message(u"\n\n".join(sections), now))


if __name__ == "__main__":
    main()