我的目标是优化一个框架:它对来自 CSV 的行列表依次应用一组修饰器(modifier)。每个修饰器都借助表头列表,按列名(而不是按下标)来访问数据。
CSV 示例(包括标题):
date;place
13/02/2013;New York
15/04/2012;Buenos Aires
29/10/2010;Singapour
我已经编写了一些基于 namedtuple 的代码,以便能够使用 csv 模块生成的列表,而无需每次都重新组织数据。生成的代码如下:
class MyNamedList(object):
    """List-backed record that exposes fixed positions as named attributes.

    The instance keeps a reference to the list it is given (no copy is made),
    so writes through the attributes are visible in the caller's list and
    vice versa.
    """

    # A single slot keeps instances free of a per-instance __dict__.  The
    # double-underscore name is mangled consistently in the slot and in the
    # methods below.
    __slots__ = ("__values",)
    _fields = ['date', 'ignore', 'place']

    def __init__(self, values):
        """Wrap ``values``, padding it in place with ``None`` up to 151 items.

        The padding guarantees that the highest mapped index (150) exists.
        """
        self.__values = values
        missing = 151 - len(values)
        if missing > 0:
            # In-place extension: the caller's list object grows as well.
            self.__values += [None] * missing

    @property
    def date(self):
        """Item stored at index 0."""
        return self.__values[0]

    @date.setter
    def date(self, val):
        self.__values[0] = val

    @property
    def ignore(self):
        """Item stored at index 150."""
        return self.__values[150]

    @ignore.setter
    def ignore(self, val):
        self.__values[150] = val

    @property
    def place(self):
        """Item stored at index 1."""
        return self.__values[1]

    # The setter must be derived from the ``place`` property itself; any
    # other name here is undefined while the class body executes.
    @place.setter
    def place(self, val):
        self.__values[1] = val
我必须说,我对使用这个类的性能非常失望。对一个 70000 行的 csv 文件的每一行调用一个简单的修饰函数(把 “ignore” 设为 True 100 次。是的,我知道这毫无用处)需要 9 秒(使用 pypy;使用原生 python 则需要 5.5 秒),而使用名为 foo 的列表的等效代码只需要 1.1 秒(pypy 和原生 python 相同)。
我能做些什么,让这两种方法的性能大致相当吗?在我看来,record.ignore = True 可以被直接内联(或近似如此),从而转换成 record[150] = True。是否存在某个我没有注意到的障碍,使得这种优化无法发生?
请注意,我所修改的记录(目前)实际上并不是为 CSV 文件中的每一行都创建一次,这意味着向列表中补充元素的操作只在迭代开始之前发生一次。
更新:示例代码
--> 使用命名列表
import namedlist
import csv

# Benchmark variant 1: the modifier writes through the generated attribute.
MyNamedList = namedlist.namedlist("MyNamedList", {"a": 1, "b": 2, "ignore": 150})
test = MyNamedList([0, 1])


def foo(a):
    """Modifier under test; the CSV row ``a`` is not used by the body."""
    test.ignore = True  # x100 times


# Binary mode is what the Python 2 csv module expects.  The context manager
# closes the file once every row has been processed.
with open("66666.csv", "rb") as csv_file:
    stream = csv.reader(csv_file)
    for row in stream:
        foo(row)
--> 不使用命名列表
import namedlist
import csv
# Benchmark variant 2: the modifier writes straight into a plain list.
# The named-list class and instance are still built although foo() below
# never touches them (presumably so both variants pay the same setup cost).
MyNamedList = namedlist.namedlist("MyNamedList", {"a": 1, "b": 2, "ignore": 150})
test = MyNamedList([0, 1])

# 151 slots so that index 150 is valid.
sample_data = [None] * 151


def foo(a):
    """Modifier under test; the CSV row ``a`` is not used by the body."""
    sample_data[150] = True  # x100 times


# Binary mode is what the Python 2 csv module expects.  The context manager
# closes the file once every row has been processed.
with open("66666.csv", "rb") as csv_file:
    stream = csv.reader(csv_file)
    for row in stream:
        foo(row)
更新 #2:namedlist.py 的代码(主要基于 namedtuple.py)
# Retrieved from http://code.activestate.com/recipes/500261/
# Licensed under the PSF license
from keyword import iskeyword as _iskeyword
import sys as _sys
def namedlist(typename, field_indices, verbose=False, rename=False):
    """Build a class whose attributes read and write fixed list positions.

    The generated class wraps a list (without copying it) and exposes one
    property per entry of ``field_indices``; getting or setting the property
    indexes the wrapped list at the mapped position.  On construction the
    wrapped list is padded in place with ``None`` so that the highest mapped
    index exists.

    Args:
        typename: Name of the class to generate.
        field_indices: Mapping of field name -> index into the wrapped list.
        verbose: When true, print the generated class source.
        rename: When true, field names starting with an underscore are
            accepted.  No renaming is actually performed.

    Returns:
        The newly created class.  Its ``_fields`` attribute is the list of
        field names.

    Raises:
        ValueError: If ``typename`` or a field name is empty, contains
            characters other than alphanumerics and underscores, is a
            keyword, starts with a digit, or (unless ``rename``) starts with
            an underscore.
        SyntaxError: If the generated source fails to compile; the message
            includes the generated source.
    """
    # Parse and validate the field names. Validation serves two purposes,
    # generating informative error messages and preventing template
    # injection attacks.
    field_names = list(field_indices)
    for name in [typename] + field_names:
        if not name:
            raise ValueError('Type names and field names cannot be empty: %r' % name)
        if not all(c.isalnum() or c == '_' for c in name):
            raise ValueError('Type names and field names can only contain alphanumeric characters and underscores: %r' % name)
        if _iskeyword(name):
            raise ValueError('Type names and field names cannot be a keyword: %r' % name)
        if name[0].isdigit():
            raise ValueError('Type names and field names cannot start with a number: %r' % name)
    seen_names = set()
    for name in field_names:
        if name.startswith('_') and not rename:
            raise ValueError('Field names cannot start with an underscore: %r' % name)
        if name in seen_names:
            raise ValueError('Encountered duplicate field name: %r' % name)
        seen_names.add(name)

    # Minimum length the wrapped list needs so that every mapped index exists.
    max_index = (max(field_indices.values()) + 1) if field_indices else 0

    # Create and fill-in the class template.  The double-underscore attribute
    # is name-mangled consistently in __slots__ and in the method bodies
    # because all of them are compiled inside the same class statement.
    source_lines = [
        'class %s(object):' % typename,
        '    __slots__ = ("__values",)',
        '    _fields = %r' % (field_names,),
        '',
        '    def __init__(self, values):',
        '        self.__values = values',
        '        missing = %d - len(values)' % max_index,
        '        if missing > 0:',
        '            self.__values += [None] * missing',
    ]
    for name in field_names:
        index = field_indices[name]
        source_lines += [
            '',
            '    @property',
            '    def %s(self):' % name,
            '        return self.__values[%d]' % index,
            '',
            '    @%s.setter' % name,
            '    def %s(self, val):' % name,
            '        self.__values[%d] = val' % index,
        ]
    template = '\n'.join(source_lines) + '\n'
    if verbose:
        print(template)

    # Execute the template string in a temporary namespace
    namespace = {'__name__': 'namedtuple_%s' % typename}
    try:
        exec(template, namespace)
    except SyntaxError as e:
        raise SyntaxError(str(e) + ':\n' + template)
    result = namespace[typename]

    # For pickling to work, the __module__ variable needs to be set to the frame
    # where the named list is created. Bypass this step in environments where
    # sys._getframe is not defined (Jython for example) or sys._getframe is not
    # defined for arguments greater than 0 (IronPython).
    try:
        result.__module__ = _sys._getframe(1).f_globals.get('__name__', '__main__')
    except (AttributeError, ValueError):
        pass

    return result