1

Python版本: 2.7.10

我的代码:

# -*- coding: utf-8 -*-

from urllib2 import urlopen
from bs4 import BeautifulSoup
from collections import OrderedDict
import re
import string

def cleanInput(input):
    """Normalise raw text into a list of cleaned word tokens.

    Runs of newlines become a single space, bracketed numeric markers such
    as ``[12]`` are removed, repeated spaces are collapsed, and characters
    that cannot be represented in ASCII are dropped.  The text is then split
    on single spaces; each token has leading and trailing punctuation
    stripped, and tokens of length 0 or 1 are discarded unless they are
    "a" or "i" (in either case).
    """
    text = re.sub('\n+', " ", input)
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(' +', " ", text)
    # Round-trip through UTF-8 bytes so non-ASCII characters can be ignored.
    text = bytearray(text, "UTF-8").decode("ascii", "ignore")

    stripped = (token.strip(string.punctuation) for token in text.split(' '))
    return [word for word in stripped
            if len(word) > 1 or word.lower() in ('a', 'i')]

def ngrams(input, n):
    """Return every run of ``n`` consecutive cleaned words from ``input``.

    The text is tokenised with ``cleanInput``.  Each n-gram is a list of
    ``n`` words, and the n-grams are returned in the order they occur.
    """
    words = cleanInput(input)
    window_count = len(words) - n + 1
    return [words[start:start + n] for start in range(window_count)]

# Fetch the Wikipedia article and extract the text of its main content div.
url = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
html = urlopen(url)
bsObj = BeautifulSoup(html, 'lxml')
content = bsObj.find("div", {"id": "mw-content-text"}).get_text()
# Rebinds the name `ngrams` from the function to the list of 2-grams it returns.
ngrams = ngrams(content, 2)
# Build a dict keyed by list position; each value is one 2-gram (a list of words).
keys = range(len(ngrams))
ngramsDic = {}
for i in range(len(keys)):
    ngramsDic[keys[i]] = ngrams[i]
# Earlier attempt, kept for reference; it fails because `ngrams` is a list:
# ngrams = OrderedDict(sorted(ngrams.items(), key=lambda t: t[1], reverse=True))
# NOTE: the sort key t[1] is the 2-gram itself, so the entries are ordered by
# their words (descending); no occurrence counts are computed anywhere here.
ngrams = OrderedDict(sorted(ngramsDic.items(), key=lambda t: t[1], reverse=True))


# Python 2 print statements.
print ngrams
print "2-grams count is: " + str(len(ngrams))

我最近在通过《通过 Python 进行 Web 抓取:从现代 Web 收集数据》一书学习网页抓取。在第 7 章"数据规范化"一节中,我先照着书中的示例编写了相同的代码,结果在终端中收到如下错误:

Traceback (most recent call last):
  File "2grams.py", line 40, in <module>
    ngrams = OrderedDict(sorted(ngrams.items(), key=lambda t: t[1], reverse=True))
AttributeError: 'list' object has no attribute 'items'

因此,我修改了代码:新建一个字典,把 ngrams 中的各项作为字典的值。但我得到的结果与书中完全不同:

在此处输入图像描述

问题:

  1. 如果我想得到书中显示的结果(按出现频率排序),是应该自己编写代码来统计每个 2-gram 的出现次数,还是书中的代码本身已经具备该功能(书中的代码是 Python 3 代码)?书中的示例代码在 GitHub 上。
  2. 我输出的频率与作者的结果不一致,例如 [u'Software', u'Foundation'] 出现了 37 次而不是 40 次。造成这种差异的原因是什么(可能是我的代码有错误)?

图书截图:

图书截图1图书截图2

4

5 回答 5

1

I ran into the same problem when reading this book. `ngrams` should be a dict. Python version: 3.4

here is my code:

from urllib.request import urlopen
from bs4 import BeautifulSoup
from collections import OrderedDict
import re
import string

def cleanInput(input):
    """Turn raw article text into a list of cleaned word tokens.

    Steps, in order: replace runs of newlines with one space, remove
    bracketed numeric citation markers such as ``[12]``, collapse runs of
    spaces, drop every non-ASCII character, split on single spaces, and
    strip leading/trailing punctuation from each token.  Tokens of length
    0 or 1 are discarded unless they are "a" or "i" (in either case).

    Args:
        input: The text to clean (a ``str``).

    Returns:
        A list of cleaned word strings in their original order.
    """
    input = re.sub('\n+', ' ', input)
    # The character class needs its own "[" after the escaped literal "\[";
    # without it the pattern never matches markers like "[1]".
    input = re.sub(r'\[[0-9]*\]', '', input)
    # Collapse repeated spaces.  The pattern is " +" (one or more spaces),
    # not an escaped plus sign, which would replace literal "+" characters.
    input = re.sub(' +', ' ', input)
    # Encode to bytes, then decode as ASCII while ignoring anything else.
    input = bytes(input, 'utf-8').decode('ascii', 'ignore')
    cleaned = []
    for item in input.split(' '):
        item = item.strip(string.punctuation)
        if len(item) > 1 or item.lower() in ('a', 'i'):
            cleaned.append(item)
    return cleaned

def ngrams(input, n):
    """Build the list of n-grams found in ``input``.

    The text is first tokenised by ``cleanInput``; every slice of ``n``
    consecutive tokens is then collected, in order, as a list of words.
    """
    tokens = cleanInput(input)
    last_start = len(tokens) - n
    return [tokens[pos:pos + n] for pos in range(last_start + 1)]

# Download the article and pull the plain text out of the main content div.
html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)")
bsObj = BeautifulSoup(html, "lxml")
content = bsObj.find("div", {"id": "mw-content-text"}).get_text()
# ngrams1 is a list of two-word lists, e.g.
# [['This', 'article'], ['article', 'is'], ['is', 'about'], ...]
ngrams1 = ngrams(content, 2)

# Tally how often each 2-gram occurs.  A list cannot be used as a dict key,
# so the string form of each 2-gram serves as the key instead.
ngrams = {}
for pair in ngrams1:
    key = str(pair)
    if key in ngrams:
        ngrams[key] += 1
    else:
        # First time this 2-gram is seen.
        ngrams[key] = 1

# Most frequent 2-grams first.
ngrams = OrderedDict(sorted(ngrams.items(), key=lambda entry: entry[1], reverse=True))
print(ngrams)
print("2-grams count is:"+str(len(ngrams)))

This is a part of my result:

OrderedDict([("['Python', 'Software']", 37), ("['Software', 'Foundation']", 37), ("['of', 'the']", 37), ("['of', 'Python']", 35), ("['Foundation', 'Retrieved']", 32),
于 2016-03-28T06:31:39.143 回答
1

我在这一章也遇到了同样的错误,原因是 ngrams 是一个列表。我把它转换成 dict 之后就可以运行了:

# NOTE(review): if ngrams1 is the list of two-word lists returned by ngrams(),
# dict(ngrams1) maps each first word to a second word rather than counting
# occurrences, so the sort below orders by word, not by frequency - confirm.
ngrams1 = OrderedDict(sorted(dict(ngrams1).items(), key=lambda t: t[1], reverse=True))
于 2016-01-10T20:44:25.247 回答
0

更优雅的解决方案是使用collections.defaultdict

这是我的代码(使用 Python 2.7+):

import requests
import re
import string
from bs4 import BeautifulSoup
from collections import OrderedDict, defaultdict


def clean_input(input):
    """Turn raw text into a list of cleaned, ASCII-only word tokens.

    Works on both Python 2.7 and Python 3.  Runs of newlines become one
    space, bracketed numeric markers such as ``[12]`` are removed, repeated
    spaces are collapsed, and non-ASCII characters are dropped.  The text is
    split on single spaces, punctuation is stripped from both ends of each
    token, and tokens of length 0 or 1 are discarded unless they are "a" or
    "i" (in either case).

    Args:
        input: Text to clean.  Byte strings are decoded as UTF-8 first.

    Returns:
        A list of native ``str`` tokens in their original order.
    """
    # Byte strings (Python 2 ``str`` / Python 3 ``bytes``) are decoded up
    # front so that the rest of the function always works on text.
    if isinstance(input, bytes):
        input = input.decode('utf-8')
    input = re.sub('\n+', " ", input)
    input = re.sub(r'\[[0-9]*\]', "", input)
    input = re.sub(' +', " ", input)
    # Drop non-ASCII characters.  The previous ``bytes(input)`` call raised
    # UnicodeEncodeError for non-ASCII text on Python 2 and TypeError on
    # Python 3 (no encoding given).
    ascii_text = input.encode('ascii', 'ignore')
    if not isinstance(ascii_text, str):
        # Python 3: encode() returned bytes; convert back to a native string.
        ascii_text = ascii_text.decode('ascii')
    words = []
    for item in ascii_text.split(' '):
        item = item.strip(string.punctuation)
        if len(item) > 1 or item.lower() in ('a', 'i'):
            words.append(item)
    return words


def ngrams(input, n):
    """Return the list of n-grams (lists of ``n`` words) found in ``input``.

    The text is tokenised with ``clean_input`` and every window of ``n``
    consecutive tokens is collected in order of appearance.

    Args:
        input: Text to split into n-grams.
        n: Number of words per n-gram.

    Returns:
        A list of word lists; empty when the text has fewer than ``n`` words.
    """
    input = clean_input(input)
    # ``range`` exists on both Python 2 and Python 3; ``xrange`` is
    # Python 2 only and raises NameError on Python 3.
    return [input[i:i + n] for i in range(len(input) - n + 1)]


# The article title ends with a closing ")".  A request without it asks for a
# different page title, so the resulting 2-gram counts would not describe the
# intended article.
response = requests.get("http://en.wikipedia.org/wiki/Python_(programming_language)")
bsObj = BeautifulSoup(response.content, "html.parser")
content = bsObj.find("div", {"id":"mw-content-text"}).get_text()
ngrams1 = ngrams(content, 2)
# Count occurrences of each 2-gram; defaultdict(int) starts every new key at 0.
# str(k) is used as the key because the 2-grams are lists, which are unhashable.
ngrams = defaultdict(int)
for k in ngrams1:
    ngrams[str(k)] += 1
# Order the counts from most to least frequent.
ngrams = OrderedDict(sorted(ngrams.items(), key=(lambda t: t[1]), reverse=True))
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(ngrams)
print("2-grams count is: %d" % len(ngrams))

这是我的结果的一部分:

OrderedDict([("['Python', 'programming']", 5), ("['programming', 'language']", 4), ("['for', 'Python']", 3), ("['the', 'page']", 2), ("['language', 'in']", 2), ("['sister', 'projects']", 1), ("['language', 'article']", 1), ("['page', 'I']", 1), ("['see', 'Why']", 1),
于 2017-07-16T11:01:13.663 回答
0

实际上,大多数编程书籍都会告诉您在哪里可以找到书中的资料或代码。

对于本书,您可以在以下位置找到所有示例代码:

http://pythonscraping.com/code/ ,该地址会将您重定向到

https://github.com/REMitchell/python-scraping

然后您可以在第 7 章文件夹中找到您的代码。 在此处输入图像描述 您可以在您的书中看到以下屏幕截图,以及我用蓝色框标记的示例代码的 url: 在此处输入图像描述

2-clean2grams.py 中的示例代码:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import OrderedDict

def cleanInput(input):
    """Clean raw text and return its words as a list.

    Applies three substitutions in order (newline runs -> space, bracketed
    numeric markers such as ``[3]`` -> removed, space runs -> one space),
    drops non-ASCII characters, splits on single spaces and strips
    punctuation from both ends of every token.  Tokens of length 0 or 1 are
    skipped unless they are "a" or "i" (in either case).
    """
    text = input
    for pattern, replacement in (('\n+', " "), (r'\[[0-9]*\]', ""), (' +', " ")):
        text = re.sub(pattern, replacement, text)
    # Encode to UTF-8 bytes, then decode as ASCII ignoring everything else.
    ascii_text = bytes(text, "UTF-8").decode("ascii", "ignore")

    tokens = []
    for raw in ascii_text.split(' '):
        token = raw.strip(string.punctuation)
        if len(token) <= 1 and token.lower() not in ('a', 'i'):
            continue
        tokens.append(token)
    return tokens

def getNgrams(input, n):
    """Count how often each n-gram occurs in ``input``.

    The text is tokenised with ``cleanInput``; each window of ``n``
    consecutive words is joined with single spaces and used as a key.
    Returns a dict mapping that n-gram string to its number of occurrences.
    """
    words = cleanInput(input)
    counts = dict()
    for start in range(len(words) - n + 1):
        key = " ".join(words[start:start + n])
        counts[key] = counts.get(key, 0) + 1
    return counts

# Fetch the article and extract the text of its main content div.
html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)")
bsObj = BeautifulSoup(html, "html.parser")
content = bsObj.find("div", {"id":"mw-content-text"}).get_text()

# Count the 2-grams, then order them from most to least frequent before
# printing.  (To inspect the raw, unsorted counts and their total instead,
# print getNgrams(content, 2) and its len().)
ngrams = OrderedDict(
    sorted(getNgrams(content, 2).items(), key=lambda pair: pair[1], reverse=True)
)
print(ngrams)

在此示例代码中,您可能会得到如下结果:

[('Python Software', 37), ('Software Foundation', 37), ...

如果你想要你的结果,比如:

[("['Python', 'Software']", 37), ("['Software', 'Foundation']", 37), ...

您只需要进行一些修改,如下所示:

在此处输入图像描述

于 2018-01-24T06:59:53.373 回答
0

列表(list)没有 items 方法。我只是把列表改成了 dict。下面是我修改后的代码:

def ngrams(input, n):
    """Return a dict mapping each n-gram in ``input`` to its occurrence count.

    Words come from ``cleanInput``; an n-gram key is ``n`` consecutive words
    joined by single spaces.
    """
    tokens = cleanInput(input)
    tally = dict()
    for pos in range(len(tokens) - n + 1):
        phrase = " ".join(tokens[pos:pos + n])
        # Start unseen phrases at zero, then count this occurrence.
        tally.setdefault(phrase, 0)
        tally[phrase] += 1
    return tally
于 2017-12-09T05:04:58.293 回答