Python脚本:
#!/usr/bin/python3.2
import re, sys
import requests
import time as ti
from bs4 import BeautifulSoup as bs
# Site root; request paths are appended to it.
base_url = 'http://www.newegg.com'

# Value sent in the User-Agent request header.
user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'

# Sample Newegg listing URLs kept for reference:
#http://www.newegg.com/Computer-Hardware/Store
#http://www.newegg.com/CPUs-Processors/Category/ID-34
#http://www.newegg.com/Processors-Desktops/SubCategory/ID-343
#http://www.newegg.com/Processors-Servers/SubCategory/ID-727
#http://www.newegg.com/Processors-Mobile/SubCategory/ID-759

# HTTP request headers handed to get_page() by login().
header = {
    'Host': 'www.newegg.com',
    'User-Agent': user_agent,
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    ),
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip,deflate',
    'Referer': 'http://www.newegg.com/Processors-Desktops/SubCategory/ID-343',
    'Connection': 'keep-alive',
    'DNT': '1',
    'Content-Type': 'application/x-www-form-urlencoded',
}

# Query-string parameters that login() passes along with the listing request.
login_data_1 = {'Pagesize': '10'}
def write_file(file, data):
    """Write *data* to the path *file*, replacing any existing contents.

    Args:
        file: Path of the file to create or overwrite.
        data: Bytes-like object to store; the file is opened in binary mode.

    Returns:
        None.
    """
    # The context manager flushes and closes the handle even when the write
    # raises, instead of leaving the cleanup to garbage collection.
    with open(file, 'wb') as out:
        out.write(data)
def get_page(s, req_type, url, header, params, data):
    """Issue one HTTP request through *s* and return the body as UTF-8 bytes.

    Args:
        s: Session-like object exposing ``request()``.
        req_type: HTTP method name handed to ``s.request``.
        url: Address to request.
        header: Mapping forwarded as the ``headers`` argument.
        params: Forwarded as the ``params`` argument.
        data: Forwarded as the ``data`` argument.

    Returns:
        The response's ``text`` attribute encoded as UTF-8.
    """
    response = s.request(
        req_type,
        url,
        headers=header,
        params=params,
        data=data
    )
    body_text = response.text
    return body_text.encode('utf-8')
def login():
    """Download the desktop-processor listing, print it and save it.

    Opens a new ``requests`` session, GETs the listing page under
    ``base_url`` with the module-level ``header`` and ``login_data_1``,
    prints the returned bytes and writes them to ``egg.data``.
    """
    session = requests.Session()
    # Send preparatory login cmd
    listing_url = base_url + '/Processors-Desktops/SubCategory/ID-343'
    page_bytes = get_page(
        session,
        'get',
        listing_url,
        header=header,
        params=login_data_1,
        data=None
    )
    print(page_bytes)
    write_file('egg.data', page_bytes)
# `re` is already imported at the top of the file, so the two repeated
# `import re` statements that used to sit here were dropped.

# NOTE(review): `pat` is not referenced in the visible part of this script;
# kept so the module-level name stays available.
pat = re.compile(b'titleDescriptionID[0-9]+')

# Read back the page that login() writes to egg.data. The context manager
# closes the handle, which the bare open() call never did.
with open('egg.data', 'rb') as f:
    content = f.read()

# Decode leniently, then drop every character outside the ASCII range.
content = content.decode('utf-8', 'replace')
content = content.encode('ascii', 'ignore').decode('ascii')

# NOTE(review): no parser is named, so bs4 chooses among the parsers that
# happen to be installed; consider pinning one for reproducible results.
soup = bs(content)

# Matches ids that start with "titleDescriptionID", optionally followed by
# digits (description_id applies it with match(), i.e. anchored at the start).
id_pat = re.compile('titleDescriptionID[0-9]*')
def description_id(tag):
    """Filter for ``find_all``: select and print title and price tags.

    A tag is selected when it is either

    * a ``<span>`` whose first class is ``itemDescription`` and whose ``id``
      matches the module-level ``id_pat``, or
    * any ``<strong>`` or ``<sup>`` tag (the sample markup uses this pair for
      the dollars and cents of the price).

    The ``string`` of every selected tag is printed, as before. Unlike the
    earlier version, the function also reports the decision to the caller,
    so ``find_all`` actually collects the selected tags, and a span without
    an ``id`` attribute is skipped instead of raising ``TypeError``.

    Args:
        tag: A bs4 tag, or any object with ``name``, ``get()`` and ``string``.

    Returns:
        True if the tag was selected (and printed), otherwise False.
    """
    if tag.name == 'span':
        classes = tag.get('class')
        tag_id = tag.get('id')
        selected = (
            bool(classes)
            and classes[0] == 'itemDescription'
            # Matching against a missing id (None) would raise TypeError.
            and tag_id is not None
            and id_pat.match(tag_id) is not None
        )
    else:
        selected = tag.name in ('strong', 'sup')

    if selected:
        print(tag.string)
    return selected
# Run description_id against the tags of the parsed document and keep whatever
# find_all returns; description_id prints the text of the tags it recognises.
items = soup.find_all(description_id)
# Re-downloading the page is disabled; the script parses the egg.data file
# that is already on disk.
#login()
数据:
<div class="itemText">
<div class="wrapper">
<span class="itemPromo">Customer Choice Award Winner</span>
<a href="http://www.newegg.com/Product/Product.aspx?Item=N82E16819116501" title="View Details" >
<span class="itemDescription" id="titleDescriptionID" style="display:inline">Intel Core i7-3770K Ivy Bridge 3.5GHz (3.9GHz Turbo) LGA 1155 77W Quad-Core Desktop Processor Intel HD Graphics 4000 BX80637I73770K</span>
<span class="itemDescription" id="lineDescriptionID" style="display:none">Intel Core i7-3770K Ivy Bridge 3.5GHz (3.9GHz Turbo) LGA 1155 77W Quad-Core Desktop Processor Intel HD Graphics 4000 BX80637I73770K</span>
</a>
</div>
<ul class="itemFeatures" ><li> 22 nm Ivy Bridge 77W</li><li> 8MB L3 Cache</li><li> 4 x 256KB L2 Cache</li>
</ul>
<ul class="featureList" >
<li><b>Series:</b> Core i7</li><li><b>L2 Cache:</b> 4 x 256KB</li><li><b>L3 Cache:</b> 8MB</li><li><b>Manufacturing Tech:</b> 22 nm</li>
<li><b>Model #: </b>BX80637I73770K</li>
<li><b>Item #: </b>N82E16819116501</li>
<li><b>Return Policy: </b><a href="http://www.newegg.com/HelpInfo/ReturnPolicy.aspx#39" target="_blank" title="CPU Replacement Only Return Policy(New Window)">CPU Replacement Only Return Policy</a></li>
</ul>
</div>
<div class="itemAction">
<ul class="price price-product-cells" >
<li class="price-was " >
</li>
<li class="price-map" ></li>
<li class="price-current " >
<span class="price-current-label"></span>
$<strong>319</strong><sup>.99</sup>
<span class="price-current-range">
<abbr title="to">–</abbr>
</span>
</li>
<li class="price-save " >
</li>
<li class="price-note">
</li>
<li class="price-ship">Free Shipping</li>
</ul>
有什么办法可以清理 description_id() 吗?或者有没有更好的方法来实现同样的功能?
我正在使用 bs4 提取特定标签的字符串,这些标签的 id 匹配 titleDescriptionID[0-9]+;但我还必须提取 <strong> 和 <sup> 标签中的价格。
在较早的一个帖子中,有人建议使用 find_all(),但我认为不能将正则表达式对象作为属性值传给它。该帖子为:BeautifulSoup: <div class <span class></span><span class>TEXT I WANT</span >
这就是为什么我决定改为传递一个函数。