1

这是在我的系统上运行的代码,代码运行良好,数据量很少,但相同的代码不适用于大量数据集或文本文件,是的@ennikiller 是对的,有些文件是空的(意味着它们没有Esgn值,但它们有其他值),我想将其处理为,它应该Esgn为所有其他值分配零或仅计算最小值并离开Esgn(在该特定文件中不包含Esgn全部)

    import os.path
    import glob
    import re
    import itertools
    from collections import namedtuple, deque
    from operator import attrgetter

    R_PREFIX_VALUE = re.compile(r'^(?P<prefix>[A-Z]+)(?P<suffix>\d+)\s+(?P<value>\d+)\s*$')

    getvalue  = attrgetter('value')

    def interleave(seq, val):
        return itertools.chain.from_iterable(itertools.izip(seq, itertools.repeat(val)))

    class Fileline(namedtuple('Fileline', 'filename prefix suffix value')):
        @classmethod
        def _fromstr(cls, s, filename=None, rematch=R_PREFIX_VALUE.match):
            m = rematch(s)
            if not m:
                raise ValueError('No valid line found in %r' % s)
            d = m.groupdict()
            d['value'] = int(d['value'])
            d['filename'] = filename
            return cls(**d)

        def _asstr(self):
            return '{}{} {}'.format(self.prefix, self.suffix, self.value)

    def max_value_with_prefix(lineseq, prefix, getvalue=getvalue):
        withprefix = (line for line in lineseq if line.prefix==prefix)
        return max_value(withprefix)

    def filter_lt_line(lineseq, maxline):
        for line in lineseq:
            if line.prefix != maxline.prefix or line.value >= maxline.value:
                yield line

    def extreme_value(fn, lineseq, getvalue=getvalue):
        try:
            return fn((l for l in lineseq if l is not None), key=getvalue)
        except ValueError:
            return None

    def max_value(lineseq):
        return extreme_value(max, lineseq)

    def min_value(lineseq):
        return extreme_value(min, lineseq)

    def read_lines(fn, maker=Fileline._fromstr):
        with open(fn, 'rb') as f:
            return deque(maker(l, fn) for l in f)

    def write_file(fn, lineseq):
        lines = (l._asstr() for l in lineseq)
        newlines = interleave(lines, '\n')
        with open(fn, 'wb') as f:
            f.writelines(newlines)

    def write_output_file(fn, lineseq):
        lines = ("{} {}".format(l.filename, l.value) for l in lineseq)
        newlines = interleave(lines, "\n")
        with open(fn, 'wb') as f:
            f.writelines(newlines)

    def filter_max_returning_min(fn, prefix):
        lineseq = read_lines(fn)
        maxvalue = max_value_with_prefix(lineseq, prefix)
        filteredlineseq = deque(filter_lt_line(lineseq, maxvalue))
        write_file(fn, filteredlineseq)
        minline = min_value(filteredlineseq)
        return minline

    def main(fileglob, prefix, outputfile):
        minline = None
        for fn in glob.iglob(fileglob):
            fileminline = filter_max_returning_min(fn, prefix)
            minline = min_value([minline, fileminline])
        write_output_file(outputfile, [minline])
    def _worker(args):
        return filter_max_returning_min(*args)

    """def multi_main(fileglob, prefix, outputfile, processes):
        from multiprocessing import Pool
        pool = Pool(processes=processes)
        workerargs = ((fn, prefix) for fn in glob.iglob(fileglob))
        minlines = pool.imap_unordered(_worker, workerargs, processes)
        minline = min_value(minlines)
        write_file(outputfile, [minline])"""
    def main(fileglob, prefix, outputfile):
        minlines = []
        for fn in glob.iglob(fileglob):
            minlines.append(filter_max_returning_min(fn, prefix))
        write_output_file(outputfile, minlines)
    main('C:\Python27\DataSet\*.txt', 'ENSG', 'output.txt') 

这是我的代码,这个代码在 10/20 文本文件上运行良好,但是当我运行它 3000 txt 文件时,它给了我以下错误

    Traceback (most recent call last):
      File "C:\Users\Ir\Desktop\S_Project\Finding_min_val_and_escaping_'Esgn'\dataset_code.py", line 95, in <module>
        main('C:\Python27\DataSet\*.txt', 'ENSG', 'output.txt')
      File "C:\Users\Ir\Desktop\S_Project\Finding_min_val_and_escaping_'Esgn'\dataset_code.py", line 93, in main
        minlines.append(filter_max_returning_min(fn, prefix))
      File "C:\Users\Ir\Desktop\S_Project\Finding_min_val_and_escaping_'Esgn'\dataset_code.py", line 69, in filter_max_returning_min
        filteredlineseq = deque(filter_lt_line(lineseq, maxvalue))
      File "C:\Users\Ir\Desktop\S_Project\Finding_min_val_and_escaping_'Esgn'\dataset_code.py", line 35, in filter_lt_line
        if line.prefix != maxline.prefix or line.value >= maxline.value:
    AttributeError: 'NoneType' object has no attribute 'prefix'
lines = ("{} {}".format(l.filename, l.value) for l in lineseq)
AttributeError: 'NoneType' object has no attribute 'filename'
4

1 回答 1

0

尝试打印出 line 和 maxline 的值

显然,其中一个或另一个被设置为无。很可能是maxline,如果事实确实如此。所以找出你为什么为 maxline 传递 None 。所以你真的应该看这里:

try:
    lineseq = read_lines(fn)
    maxvalue = max_value_with_prefix(lineseq, prefix)
    filteredlineseq = deque(filter_lt_line(lineseq, maxvalue))
    write_file(fn, filteredlineseq)
    minline = min_value(filteredlineseq)
except:
    print "%s is apparently empty" % fn
    minline = None
finally:
    return milline

fn 有可能是一个空文件吗?

正如 Joran 所说,您只需要添加一些异常处理(我已在上面添加)

于 2013-04-09T19:52:00.340 回答