由于您的数据没有排序,请使用 collections.defaultdict()
对象为每个新键自动创建一个列表,然后以电影标题作为键进行分组:
from collections import defaultdict
# Bucket every film record under its title. defaultdict(list) creates the
# empty list the first time a title is seen, so the input does not need to
# be sorted beforehand (unlike itertools.groupby).
grouped = defaultdict(list)
for film in f:
    # In the sample data every field is a one-element list, hence the [0]
    # to get at the title string itself.
    grouped[film['film'][0]].append(film)
film['film'][0]
的值用作分组键。如果您想按标题进行更复杂的分组(例如把拼写略有差异的标题归为一组),则必须先为该键生成一个规范化的版本。
演示:
>>> from collections import defaultdict
>>> import json
>>> with open('film.json') as film_file:
...     f = json.load(film_file)
...
>>> grouped = defaultdict(list)
>>> for film in f:
...     grouped[film['film'][0]].append(film)
...
>>> grouped
defaultdict(<type 'list'>, {u'Street Fighter': [{u'director': [u'E. de Souza'], u'price': [u'2,00'], u'film': [u'Street Fighter'], u'year': [u'1994']}], u'Pulp Fiction': [{u'director': [u'Tarantino'], u'price': [u'20,00'], u'film': [u'Pulp Fiction'], u'year': [u'1994']}], u'Pulp Fyction': [{u'director': [u'Tarantino'], u'price': [u'15,00'], u'film': [u'Pulp Fyction'], u'year': [u'1994']}], u'The Matrix': [{u'director': [u'Wachowski'], u'price': [u'19,00'], u'film': [u'The Matrix'], u'year': [u'1999']}, {u'director': [u'Wachowski'], u'price': [u'20,00'], u'film': [u'The Matrix'], u'year': [u'1999']}], u'Blade Runner': [{u'director': [u'Ridley Scott'], u'price': [u'19,99'], u'film': [u'Blade Runner'], u'year': [u'1982']}], u'Kill Bill vol.1': [{u'director': [u'Tarantino'], u'price': [u'10,00'], u'film': [u'Kill Bill vol.1'], u'year': [u'2003']}], u'The Matrix Reloaded': [{u'director': [u'Wachowski'], u'price': [u'9,99'], u'film': [u'The Matrix Reloaded'], u'year': [u'2003']}]})
>>> from pprint import pprint
>>> pprint(dict(grouped))
{u'Blade Runner': [{u'director': [u'Ridley Scott'],
u'film': [u'Blade Runner'],
u'price': [u'19,99'],
u'year': [u'1982']}],
u'Kill Bill vol.1': [{u'director': [u'Tarantino'],
u'film': [u'Kill Bill vol.1'],
u'price': [u'10,00'],
u'year': [u'2003']}],
u'Pulp Fiction': [{u'director': [u'Tarantino'],
u'film': [u'Pulp Fiction'],
u'price': [u'20,00'],
u'year': [u'1994']}],
u'Pulp Fyction': [{u'director': [u'Tarantino'],
u'film': [u'Pulp Fyction'],
u'price': [u'15,00'],
u'year': [u'1994']}],
u'Street Fighter': [{u'director': [u'E. de Souza'],
u'film': [u'Street Fighter'],
u'price': [u'2,00'],
u'year': [u'1994']}],
u'The Matrix': [{u'director': [u'Wachowski'],
u'film': [u'The Matrix'],
u'price': [u'19,00'],
u'year': [u'1999']},
{u'director': [u'Wachowski'],
u'film': [u'The Matrix'],
u'price': [u'20,00'],
u'year': [u'1999']}],
u'The Matrix Reloaded': [{u'director': [u'Wachowski'],
u'film': [u'The Matrix Reloaded'],
u'price': [u'9,99'],
u'year': [u'2003']}]}
例如,使用 Soundex 算法按读音对电影标题进行分组就很简单:
from itertools import groupby, islice, ifilter
# Letters that share a Soundex digit; the digit is the 1-based position of
# the group in this tuple ('bfpv' -> '1', 'cgjkqsxz' -> '2', ...).
_codes = ('bfpv', 'cgjkqsxz', 'dt', 'l', 'mn', 'r')
_sounds = {c: str(i) for i, code in enumerate(_codes, 1) for c in code}
# Vowels and 'y' map to None: they produce no digit but still separate two
# runs of the same digit. 'h', 'w' and every non-letter are deliberately
# absent, so they are skipped and do NOT break a run.
_sounds.update(dict.fromkeys('aeiouy'))


def soundex(word, _sounds=_sounds):
    """Return the 4-character Soundex code of *word*.

    The code is the upper-cased first character of *word* followed by up to
    three digits describing the consonant sounds that follow, right-padded
    with '0' (e.g. ``'Pulp Fiction'`` and ``'Pulp Fyction'`` both give
    ``'P412'``).

    Args:
        word: The text to encode. Case is ignored; characters without an
            entry in the sound table (spaces, digits, punctuation, 'h',
            'w') are skipped.
        _sounds: Letter -> digit table; bound as a default only to make it
            a fast local lookup. It is never mutated here.

    Returns:
        The Soundex code, or ``''`` when *word* is empty (there is no first
        character to build a code from).
    """
    if not word:
        # Previously word[0] raised IndexError on an empty title.
        return ''

    first = word[0]
    # Collapse adjacent letters with the same digit into one group; only the
    # group keys (digit or None) are needed, never the group contents.
    grouped = groupby(_sounds[c] for c in word.lower() if c in _sounds)
    if _sounds.get(first.lower()):
        # The first letter is emitted verbatim, so drop its digit group
        # (including any directly following letters with the same digit).
        next(grouped)
    # Keep real digits only (None marks a vowel separator), at most three.
    digits = (key for key, _ in grouped if key)
    sdx = ''.join(islice(digits, 3))
    return first.upper() + sdx.ljust(3, '0')
# Group films by the Soundex code of their title instead of the exact
# title, so titles that sound alike land in the same bucket.
grouped_by_soundex = defaultdict(list)
for film in f:
    # In the sample data every field is a one-element list, hence the [0].
    grouped_by_soundex[soundex(film['film'][0])].append(film)
结果如下:
>>> pprint(dict(grouped_by_soundex))
{u'B436': [{u'director': [u'Ridley Scott'],
u'film': [u'Blade Runner'],
u'price': [u'19,99'],
u'year': [u'1982']}],
u'K414': [{u'director': [u'Tarantino'],
u'film': [u'Kill Bill vol.1'],
u'price': [u'10,00'],
u'year': [u'2003']}],
u'P412': [{u'director': [u'Tarantino'],
u'film': [u'Pulp Fiction'],
u'price': [u'20,00'],
u'year': [u'1994']},
{u'director': [u'Tarantino'],
u'film': [u'Pulp Fyction'],
u'price': [u'15,00'],
u'year': [u'1994']}],
u'S363': [{u'director': [u'E. de Souza'],
u'film': [u'Street Fighter'],
u'price': [u'2,00'],
u'year': [u'1994']}],
u'T536': [{u'director': [u'Wachowski'],
u'film': [u'The Matrix'],
u'price': [u'19,00'],
u'year': [u'1999']},
{u'director': [u'Wachowski'],
u'film': [u'The Matrix Reloaded'],
u'price': [u'9,99'],
u'year': [u'2003']},
{u'director': [u'Wachowski'],
u'film': [u'The Matrix'],
u'price': [u'20,00'],
u'year': [u'1999']}]}