这是在我的系统上运行的代码,代码运行良好,数据量很少,但相同的代码不适用于大量数据集或文本文件,是的@ennikiller 是对的,有些文件是空的(意味着它们没有Esgn
值,但它们有其他值),我想将其处理为,它应该Esgn
为所有其他值分配零或仅计算最小值并离开Esgn
(在该特定文件中不包含Esgn
全部)
import os.path
import glob
import re
import itertools
from collections import namedtuple, deque
from operator import attrgetter
R_PREFIX_VALUE = re.compile(r'^(?P<prefix>[A-Z]+)(?P<suffix>\d+)\s+(?P<value>\d+)\s*$')
getvalue = attrgetter('value')
def interleave(seq, val):
return itertools.chain.from_iterable(itertools.izip(seq, itertools.repeat(val)))
class Fileline(namedtuple('Fileline', 'filename prefix suffix value')):
@classmethod
def _fromstr(cls, s, filename=None, rematch=R_PREFIX_VALUE.match):
m = rematch(s)
if not m:
raise ValueError('No valid line found in %r' % s)
d = m.groupdict()
d['value'] = int(d['value'])
d['filename'] = filename
return cls(**d)
def _asstr(self):
return '{}{} {}'.format(self.prefix, self.suffix, self.value)
def max_value_with_prefix(lineseq, prefix, getvalue=getvalue):
withprefix = (line for line in lineseq if line.prefix==prefix)
return max_value(withprefix)
def filter_lt_line(lineseq, maxline):
for line in lineseq:
if line.prefix != maxline.prefix or line.value >= maxline.value:
yield line
def extreme_value(fn, lineseq, getvalue=getvalue):
try:
return fn((l for l in lineseq if l is not None), key=getvalue)
except ValueError:
return None
def max_value(lineseq):
return extreme_value(max, lineseq)
def min_value(lineseq):
return extreme_value(min, lineseq)
def read_lines(fn, maker=Fileline._fromstr):
with open(fn, 'rb') as f:
return deque(maker(l, fn) for l in f)
def write_file(fn, lineseq):
lines = (l._asstr() for l in lineseq)
newlines = interleave(lines, '\n')
with open(fn, 'wb') as f:
f.writelines(newlines)
def write_output_file(fn, lineseq):
lines = ("{} {}".format(l.filename, l.value) for l in lineseq)
newlines = interleave(lines, "\n")
with open(fn, 'wb') as f:
f.writelines(newlines)
def filter_max_returning_min(fn, prefix):
lineseq = read_lines(fn)
maxvalue = max_value_with_prefix(lineseq, prefix)
filteredlineseq = deque(filter_lt_line(lineseq, maxvalue))
write_file(fn, filteredlineseq)
minline = min_value(filteredlineseq)
return minline
def main(fileglob, prefix, outputfile):
minline = None
for fn in glob.iglob(fileglob):
fileminline = filter_max_returning_min(fn, prefix)
minline = min_value([minline, fileminline])
write_output_file(outputfile, [minline])
def _worker(args):
return filter_max_returning_min(*args)
"""def multi_main(fileglob, prefix, outputfile, processes):
from multiprocessing import Pool
pool = Pool(processes=processes)
workerargs = ((fn, prefix) for fn in glob.iglob(fileglob))
minlines = pool.imap_unordered(_worker, workerargs, processes)
minline = min_value(minlines)
write_file(outputfile, [minline])"""
def main(fileglob, prefix, outputfile):
minlines = []
for fn in glob.iglob(fileglob):
minlines.append(filter_max_returning_min(fn, prefix))
write_output_file(outputfile, minlines)
main('C:\Python27\DataSet\*.txt', 'ENSG', 'output.txt')
这是我的代码,这个代码在 10/20 文本文件上运行良好,但是当我运行它 3000 txt 文件时,它给了我以下错误
Traceback (most recent call last):
File "C:\Users\Ir\Desktop\S_Project\Finding_min_val_and_escaping_'Esgn'\dataset_code.py", line 95, in <module>
main('C:\Python27\DataSet\*.txt', 'ENSG', 'output.txt')
File "C:\Users\Ir\Desktop\S_Project\Finding_min_val_and_escaping_'Esgn'\dataset_code.py", line 93, in main
minlines.append(filter_max_returning_min(fn, prefix))
File "C:\Users\Ir\Desktop\S_Project\Finding_min_val_and_escaping_'Esgn'\dataset_code.py", line 69, in filter_max_returning_min
filteredlineseq = deque(filter_lt_line(lineseq, maxvalue))
File "C:\Users\Ir\Desktop\S_Project\Finding_min_val_and_escaping_'Esgn'\dataset_code.py", line 35, in filter_lt_line
if line.prefix != maxline.prefix or line.value >= maxline.value:
AttributeError: 'NoneType' object has no attribute 'prefix'
lines = ("{} {}".format(l.filename, l.value) for l in lineseq)
AttributeError: 'NoneType' object has no attribute 'filename'