python - 如何仅合并从 file_a 到 file_b 的唯一行？

Question

这个问题已经在这里以一种或另一种形式提出过，但都不是我正在寻找的东西。所以，这就是我遇到的情况：我已经有一个名为 file_a 的文件，并且正在创建另一个文件 file_b。file_a 的大小始终大于 file_b。file_b 中会有许多重复的行（因此，在 file_a 中也是如此），但两个文件都有一些独特的行。我想要做的是：仅将唯一行从 file_a 复制/合并到 file_b，然后对行进行排序，以便 file_b 成为包含所有唯一条目的最新文件。任何一个原始文件的大小都不应超过 10MB。我能做到这一点的最有效（和最快）的方法是什么？

我在想这样的事情，合并没问题。

#!/usr/bin/env python

import os, time, sys

# Convert Date/time to epoch
def toEpoch(dt):
    """Return the integer epoch seconds for a 'dd/mm/yy HH:MM:SS' string."""
    parsed = time.strptime(dt, '%d/%m/%y %H:%M:%S')
    seconds = time.mktime(parsed)
    return int(seconds)

# input files
o_file = "file_a"
c_file = "file_b"
n_file = [o_file,c_file]

m_file = "merged.file"

# Copy the lines containing "group_" from each input file into m_file,
# handling one input file per pass of the loop.
for x in range(len(n_file)):
    # Read the whole input file into memory
    P = open(n_file[x],"r")
    output = P.readlines()
    P.close()

    # Sort the output, order by 2nd last field
    #sp_lines = [ line.split('\t') for line in output ]
    #sp_lines.sort( lambda a, b: cmp(toEpoch(a[-2]),toEpoch(b[-2])) )

    # NOTE(review): m_file is reopened in 'w' mode on every pass, so whatever
    # was written for the previous input file is overwritten.
    F = open(m_file,'w') 
    #for line in sp_lines:
    for line in output:
        # Keep only lines containing "group_" (presumably to drop the header
        # lines, which do not contain it in the sample input)
        if "group_" in line:
            F.write(line)
    F.close()

但是，它是：

包含的不只是唯一的行
未排序（按倒数第二个字段）
并且引入了第三个文件，即 m_file

只是一个旁注（长话短说）：不幸的是，我不能在这里使用 sorted()，因为我使用的是 v2.3。输入文件如下所示：

On 23/03/11 00:40:03
JobID   Group.User          Ctime   Wtime   Status  QDate               CDate
===================================================================================
430792  group_atlas.pltatl16    0   32  4   02/03/11 21:52:38   02/03/11 22:02:15
430793  group_atlas.atlas084    30  472 4   02/03/11 21:57:43   02/03/11 22:09:35
430794  group_atlas.atlas084    12  181 4   02/03/11 22:02:37   02/03/11 22:05:42
430796  group_atlas.atlas084    8   185 4   02/03/11 22:02:38   02/03/11 22:05:46

我尝试使用 cmp() 按倒数第二个字段进行排序，但我认为它不起作用只是因为输入文件的前 3 行。

有人可以帮忙吗？干杯！！！

更新1：

为了将来参考，正如 Jakob 所建议的，这里是完整的脚本。它工作得很好。

#!/usr/bin/env python

import os, time, sys
from sets import Set as set

def toEpoch(dt):
    """Convert a 'dd/mm/yy HH:MM:SS' date string to integer epoch seconds."""
    struct = time.strptime(dt, '%d/%m/%y %H:%M:%S')
    return int(time.mktime(struct))

def yield_lines(fileobj, header_lines=3):
    """Yield the lines of *fileobj* that follow its header.

    fileobj      -- an open file-like object positioned at its start
    header_lines -- number of leading lines to discard; the default of 3
                    matches the original hard-coded header length
    """
    # range() behaves like xrange() for such a small count and, unlike
    # xrange(), also exists on Python 3.
    for _ in range(header_lines):
        fileobj.readline()

    for line in fileobj:
        yield line

def app(path1, path2):
    """Return the set of distinct data lines found in the two files.

    path1, path2 -- paths of the input files; their header lines are
                    skipped by yield_lines()

    Each file is closed as soon as it has been read, even when reading
    fails (the original left both handles open).
    """
    unique = []
    for path in (path1, path2):
        handle = open(path)
        try:
            unique.append(set(yield_lines(handle)))
        finally:
            handle.close()
    return unique[0].union(unique[1])

# Input files
o_file = "testScript/03"
c_file = "03.bak"
m_file = "finished.file"

print time.strftime('%H:%M:%S', time.localtime())

# Sorting the output, order by 2nd last field
sp_lines = [ line.split('\t') for line in app(o_file, c_file) ]
sp_lines.sort( lambda a, b: cmp(toEpoch(a[-2]),toEpoch(b[-2])) )

F = open(m_file,'w')
print "No. of lines: ",len(sp_lines)

for line in sp_lines:

    MF = '\t'.join(line)
    F.write(MF)
F.close()

完成 145244 行大约需要 2m:47s。

[testac1@serv07 ~]$ ./uniq-merge.py 
17:19:21
No. of lines:  145244
17:22:08

谢谢！！

更新 2：

嗨 eyquem，这是我在运行脚本时收到的错误消息。

从第一个脚本：

[testac1@serv07 ~]$ ./uniq-merge_2.py 
  File "./uniq-merge_2.py", line 44
    fm.writelines( '\n'.join(v)+'\n' for k,v in output )
                                       ^
SyntaxError: invalid syntax

从第二个脚本：

[testac1@serv07 ~]$ ./uniq-merge_3.py 
  File "./uniq-merge_3.py", line 24
    output = sett(line.rstrip() for line in fa)
                                  ^
SyntaxError: invalid syntax

干杯！！

更新 3：

前一个根本没有对列表进行排序。感谢 eyquem 指出这一点。嗯，现在可以了。这是对 Jakob 版本的进一步修改 - 我将 set:app(path1, path2) 转换为 list:myList() ，然后对 myList 应用 sort( lambda ... )，按倒数第二个字段对合并后的文件进行排序。这是最终的脚本。

#!/usr/bin/env python

import os, time, sys
from sets import Set as set

def toEpoch(dt):
    """Convert a 'dd/mm/yy HH:MM:SS' date/time string to integer epoch seconds."""
    fields = time.strptime(dt, '%d/%m/%y %H:%M:%S')
    epoch = time.mktime(fields)
    return int(epoch)

def yield_lines(fileobj, header_lines=3):
    """Yield the lines of *fileobj* that follow its header.

    fileobj      -- an open file-like object positioned at its start
    header_lines -- number of leading lines to discard; the default of 3
                    matches the original hard-coded header length
    """
    # Discard the headers; range() is used instead of xrange() because it
    # is equivalent for such a small count and also exists on Python 3.
    for _ in range(header_lines):
        fileobj.readline()

    for line in fileobj:
        yield line

def app(path1, path2):
    """Return the set of distinct data lines found in the two files.

    path1, path2 -- paths of the input files; their header lines are
                    skipped by yield_lines()

    Duplicate lines are removed by collecting each file into a set. Each
    file is closed as soon as it has been read, even when reading fails
    (the original left both handles open).
    """
    unique = []
    for path in (path1, path2):
        handle = open(path)
        try:
            unique.append(set(yield_lines(handle)))
        finally:
            handle.close()
    return unique[0].union(unique[1])

print time.strftime('%H:%M:%S', time.localtime())

# I/O files
o_file = "testScript/03"
c_file = "03.bak"
m_file = "finished.file"

# Convert set into to list
myList = list(app(o_file, c_file))

# Sort the list by the date
sp_lines = [ line.split('\t') for line in myList ]
sp_lines.sort( lambda a, b: cmp(toEpoch(a[-2]),toEpoch(b[-2])) )

F = open(m_file,'w')
print "No. of lines: ",len(sp_lines)

# Finally write to the outFile
for line in sp_lines:
    MF = '\t'.join(line)
    F.write(MF)
F.close()

根本没有速度提升，处理相同的 145244 行需要 2m:50s。有没有人看到任何改进的范围，请告诉我。感谢 Jakob 和 eyquem 的时间。干杯！！

更新 4：

仅供将来参考，这是 eyquem 的修改版本，它比以前的版本更好更快。

#!/usr/bin/env python

import os, sys, re
from sets import Set as sett
from time import mktime, strptime, strftime

def sorting_merge(o_file, c_file, m_file ):
    """Merge the unique data lines of two job listings into m_file, oldest first.

    o_file, c_file -- paths of the input listings; each starts with an
                      "On <date>" line followed by header lines
    m_file         -- path of the merged listing, which is overwritten

    Data lines are de-duplicated across both inputs and sorted by the date
    in their next-to-last tab-separated field.  The output starts with a
    fresh "On <date>" line followed by the first two header lines read.
    """
    # RegEx for the date/time field (raw string keeps the backslashes literal)
    pat = re.compile(r'[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d')

    def kl(lines,pat = pat):
        # Sort key: timestamp of the date in the next-to-last field
        field = lines.split('\t')[-2]
        return mktime(strptime(pat.search(field).group(),'%d/%m/%y %H:%M:%S'))

    output = sett()
    head = []

    def rmHead(f_n):
        # Separate the header from the data and add the stripped data lines
        # to output.  A file without any data line now contributes nothing,
        # instead of leaking its last header line into the data.
        try:
            f_n.readline()   # old "On <date>" line: regenerated on output
            in_header = True
            for line in f_n:
                if in_header:
                    if not pat.search(line):
                        head.append(line)
                        continue
                    in_header = False
                output.add(line.rstrip())
        finally:
            f_n.close()

    rmHead(open(o_file, 'r'))
    rmHead(open(c_file, 'r'))

    # Sorting date-wise; blank lines carry no date and are dropped
    output = [ (kl(line),line) for line in output if line ]
    output.sort()

    fm = open(m_file,'w')
    try:
        # head[:2] rather than head[0]+head[1]: no IndexError on short headers
        fm.write(strftime('On %d/%m/%y %H:%M:%S\n') + ''.join(head[:2]))
        for t,line in output:
            fm.write(line + '\n')
    finally:
        fm.close()


c_f = "03_a"
o_f = "03_b"

sorting_merge(o_f, c_f, 'outfile.txt')

这个版本要快得多：处理 145244 行只需 6.99 秒，而之前使用 lambda a, b: cmp() 的版本需要 2m:47s。感谢 eyquem 的所有支持。干杯！！

score 2 · Accepted Answer

编辑 2

我以前的代码在 output = sett(line.rstrip() for line in fa) 和 output.sort(key=kl) 这两处有问题

此外，它们还有一些并发症。

因此，我研究了直接用 set() 函数读取文件的方案，Jakob Bowyer 在其代码中采用的就是这个函数。

祝贺雅各布！（顺便说一下 Michal Chruszcz）：set()无与伦比，它比一次读一行要快。

然后，我放弃了逐行读取文件的想法。

.

但是我保留了我的想法，以避免在cmp()函数的帮助下进行排序，因为正如文档中所描述的那样：

s.sort([cmpfunc=None])

sort() 方法接受一个可选参数，指定两个参数（列表项）的比较函数 (...) 请注意，这会大大减慢排序过程

http://docs.python.org/release/2.3/lib/typesseq-mutable.html

然后，我设法获得了一个元组列表(t,line)，其中t是

time.mktime(time.strptime(( 1st date-and-hour in line ,'%d/%m/%y %H:%M:%S'))

根据指示

output = [ (kl(line),line.rstrip()) for line in output]

.

我测试了2个代码。以下是通过正则表达式计算的第一个日期和时间：

def kl(line,pat = pat):
    return time.mktime(time.strptime((pat.search(line).group()),'%d/%m/%y %H:%M:%S'))

output = [ (kl(line),line.rstrip()) for line in output if line.rstrip()]

output.sort()

第二个代码kl()是：

def kl(line,pat = pat):
    return time.mktime(time.strptime(line.split('\t')[-2],'%d/%m/%y %H:%M:%S'))

.

结果是

执行时间：

使用正则表达式的第一个代码为 0.03598 秒

使用 split('\t') 的第二个代码为 0.03580 秒

也就是说一样

该算法比使用 cmp() 函数的代码更快：

一个代码，其中一组行输出未在元组列表中转换为

output = [ (kl(line),line.rstrip()) for line in output]

但仅在行列表中进行转换（然后不重复）并使用函数mycmp()进行排序（请参阅文档）：

def mycmp(a,b):
    """cmp-style comparator: order two lines by the date in their next-to-last tab field."""
    stamp_a = time.mktime(time.strptime(a.split('\t')[-2],'%d/%m/%y %H:%M:%S'))
    stamp_b = time.mktime(time.strptime(b.split('\t')[-2],'%d/%m/%y %H:%M:%S'))
    return cmp(stamp_a, stamp_b)

output = [ line.rstrip() for line in output] # not list(output) , to avoid the problem of newline of the last line of each file
output.sort(mycmp)

for line in output:
    fm.write(line+'\n')

执行时间为

0.11574 秒

.

代码：

#!/usr/bin/env python

import os, time, sys, re
from sets import Set as sett

def sorting_merge(o_file , c_file, m_file ):
    """Merge the unique data lines of two job listings into m_file, oldest first.

    o_file -- path of the first listing; its header (minus the first line)
              is copied to the output
    c_file -- path of the second listing; its header is discarded
    m_file -- path of the merged listing, which is overwritten

    A data line is a line containing two consecutive date/time fields; the
    first of them is the sort key.  Compared with the original, the first
    data line of each file is no longer lost, an input without data lines
    no longer loops forever, and blank lines are ignored.
    """
    # First of two consecutive date/time fields (raw strings keep the
    # regex backslashes literal)
    pat = re.compile(r'[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
                     r'(?=[ \t]+[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d)')

    def kl(line,pat = pat):
        return time.mktime(time.strptime((pat.search(line).group()),'%d/%m/%y %H:%M:%S'))

    output = sett()
    head = []

    def load(path, header):
        # Add the stripped data lines of *path* to output.  The lines that
        # precede the first data line form the heading; they are appended
        # to *header* unless it is None.
        src = open(path)
        try:
            if header is not None:
                src.readline()   # first line is skipped
            in_heading = True
            for line in src:
                if in_heading:
                    if not pat.search(line):
                        if header is not None:
                            header.append(line)
                        continue
                    in_heading = False   # this line is the first data line
                output.add(line.rstrip())
        finally:
            src.close()

    load(o_file, head)
    load(c_file, None)

    # Blank lines carry no date and would make kl() fail
    if '' in output:  output.remove('')
    output = [ (kl(line),line) for line in output]
    output.sort()

    fm = open(m_file,'w')
    try:
        fm.write(time.strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head)))
        for t,line in output:
            fm.write(line + '\n')
    finally:
        fm.close()


te = time.clock()
sorting_merge('ytre.txt','tataye.txt','merged.file.txt')
print time.clock()-te

这一次，我希望它能正确运行，唯一要做的就是等待实际文件的执行时间比我测试代码的文件大得多

.

编辑 3

pat = re.compile('[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
                 '(?=[ \t]+'
                 '[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
                 '|'
                 '[ \t]+aborted/deleted)')

.

编辑 4

#!/usr/bin/env python

import os, time, sys, re
from sets import Set

def sorting_merge(o_file , c_file, m_file ):
    """Merge the unique data lines of two job listings into m_file, oldest first.

    o_file -- path of the first listing; its header (minus the first line)
              is copied to the output
    c_file -- path of the second listing; its header is discarded
    m_file -- path of the merged listing, which is overwritten

    A data line contains a date/time field followed either by a second
    date/time field or by "aborted/deleted"; that first date is the sort
    key.  An input that has a header but no data lines is now accepted
    (the original added its last header line to the data and then failed).
    """
    # Raw strings keep the regex backslashes literal
    pat = re.compile(r'[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
                     r'(?=[ \t]+'
                     r'[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
                     r'|'
                     r'[ \t]+aborted/deleted)')

    def kl(line,pat = pat):
        return time.mktime(time.strptime((pat.search(line).group()),'%d/%m/%y %H:%M:%S'))

    head = []
    output = Set()

    def load(path, header):
        # Add the stripped data lines of *path* to output.  The lines that
        # precede the first data line form the heading; they are appended
        # to *header* unless it is None.
        src = open(path)
        try:
            if header is not None:
                src.readline()   # first line is skipped
            in_heading = True
            for line in src:
                if in_heading:
                    if not pat.search(line):
                        if header is not None:
                            header.append(line)   # line of the header
                        continue
                    in_heading = False   # first line after the heading
                output.add(line.rstrip())
        finally:
            src.close()

    load(o_file, head)
    load(c_file, None)

    if '' in output:  output.remove('')
    output = [ (kl(line),line) for line in output]
    output.sort()

    fm = open(m_file,'w')
    try:
        fm.write(time.strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head)))
        for t,line in output:
            fm.write(line+'\n')
    finally:
        fm.close()

te = time.clock()
sorting_merge('A.txt','B.txt','C.txt')
print time.clock()-te

score 2 · Accepted Answer

也许沿着这些思路？

from sets import Set as set

def yield_lines(fileobj, header_lines=3):
    """Yield the lines of *fileobj* that follow its header.

    fileobj      -- an open file-like object positioned at its start
    header_lines -- number of leading lines to discard; the default of 3
                    matches the original hard-coded header length
    """
    # I want to discard the headers; range() is used instead of xrange()
    # because it is equivalent here and also exists on Python 3.
    for _ in range(header_lines):
        fileobj.readline()

    for line in fileobj:
        yield line

def app(path1, path2):
    """Return the set of distinct data lines found in the two files.

    path1, path2 -- paths of the input files; their header lines are
                    skipped by yield_lines()

    Each file is closed as soon as it has been read, even when reading
    fails (the original left both handles open).
    """
    unique = []
    for path in (path1, path2):
        handle = open(path)
        try:
            unique.append(set(yield_lines(handle)))
        finally:
            handle.close()

    return unique[0].union(unique[1])

编辑：忘记了：$

score 0 · Accepted Answer

我编写了这个新代码，使用集合很容易实现。它比我以前的代码更快。而且，看起来也比你的代码更快

#!/usr/bin/env python

import os, time, sys, re
from sets import Set as sett

def sorting_merge(o_file , c_file, m_file ):
    """Merge the unique data lines of two job listings into m_file, oldest first.

    o_file -- path of the first listing; its header (minus the first line)
              is copied to the output
    c_file -- path of the second listing; its header is discarded
    m_file -- path of the merged listing, which is overwritten

    A data line is a line containing two consecutive date/time fields; the
    first of them, converted to epoch seconds, is the sort key.  Compared
    with the original, an input without data lines no longer loops forever,
    blank lines are ignored, files are always closed, and no generator
    expressions are used (they are a syntax error before Python 2.4).
    """
    # Convert Date/time to epoch
    def toEpoch(dt):
        dt_ptrn = '%d/%m/%y %H:%M:%S'
        return int(time.mktime(time.strptime(dt, dt_ptrn)))

    # group(1) is the first of two consecutive date/time fields
    pat = re.compile(r'([0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d)'
                     r'[ \t]+[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d')

    output = sett()
    head = []

    def load(path, header):
        # Add an (epoch, line) pair to output for every data line of *path*.
        # The lines that precede the first data line form the heading; they
        # are kept in *header* as ('', line) pairs unless it is None.
        src = open(path)
        try:
            if header is not None:
                src.readline()   # skip the old "On <date>" line
            in_heading = True
            for line in src:
                line = line.rstrip()
                mat = pat.search(line)
                if in_heading:
                    if not mat:
                        if header is not None:
                            header.append(('', line))
                        continue
                    in_heading = False
                if line:         # blank lines carry no date
                    output.add((toEpoch(mat.group(1)), line))
        finally:
            src.close()

    load(o_file, head)
    load(c_file, None)

    output = list(output)
    output.sort()

    fm = open(m_file,'w')
    try:
        fm.write(time.strftime('On %d/%m/%y %H:%M:%S') + '\n')
        for t,line in head + output:
            fm.write(line + '\n')
    finally:
        fm.close()



te = time.clock()
sorting_merge('ytr.txt','tatay.txt','merged.file.txt')
print time.clock()-te

请注意，此代码在合并文件中放置了一个标题

.

编辑

啊啊啊……我明白了……:-)）

执行时间除以 3 ！

#!/usr/bin/env python

import os, time, sys, re
from sets import Set as sett

def sorting_merge(o_file , c_file, m_file ):
    """Merge the unique data lines of two job listings into m_file, oldest first.

    o_file -- path of the first listing; its header (minus the first line)
              is copied to the output
    c_file -- path of the second listing; its header is discarded
    m_file -- path of the merged listing, which is overwritten

    A data line is a line containing two consecutive date/time fields; the
    first of them is the sort key.  Compared with the original, an input
    without data lines no longer loops forever, blank lines are ignored,
    files are always closed, and neither generator expressions nor
    sort(key=...) are used (both need Python 2.4 or later).
    """
    # First of two consecutive date/time fields (raw strings keep the
    # regex backslashes literal)
    pat = re.compile(r'[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
                     r'(?=[ \t]+[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d)')

    def kl(line,pat = pat):
        return time.mktime(time.strptime((pat.search(line).group()),'%d/%m/%y %H:%M:%S'))

    output = sett()
    head = []

    def load(path, header):
        # Add the stripped data lines of *path* to output.  The lines that
        # precede the first data line form the heading; they are appended
        # (stripped) to *header* unless it is None.
        src = open(path)
        try:
            if header is not None:
                src.readline()   # skip the old "On <date>" line
            in_heading = True
            for line in src:
                if in_heading:
                    if not pat.search(line):
                        if header is not None:
                            header.append(line.rstrip())
                        continue
                    in_heading = False
                output.add(line.rstrip())
        finally:
            src.close()

    load(o_file, head)
    load(c_file, None)

    # Blank lines carry no date and would make kl() fail
    if '' in output:  output.remove('')
    # Decorate-sort: pair each line with its key instead of sort(key=kl)
    output = [ (kl(line),line) for line in output ]
    output.sort()

    fm = open(m_file,'w')
    try:
        for line in [time.strftime('On %d/%m/%y %H:%M:%S')] + head:
            fm.write(line + '\n')
        for t,line in output:
            fm.write(line + '\n')
    finally:
        fm.close()

te = time.clock()
sorting_merge('ytre.txt','tataye.txt','merged.file.txt')
print time.clock()-te

score 0 · Accepted Answer

最后的代码，我希望。

因为我发现了一个杀手代码。

首先，我创建了两个文件“xxA.txt”和“yyB.txt”，每个文件有 30000 行，形如

430559  group_atlas.atlas084    12  181 4       04/03/10 01:38:02   02/03/11 22:05:42
430502  group_atlas.atlas084    12  181 4       23/01/10 21:45:05   02/03/11 22:05:42
430544  group_atlas.atlas084    12  181 4       17/06/11 12:58:10   02/03/11 22:05:42
430566  group_atlas.atlas084    12  181 4       25/03/10 23:55:22   02/03/11 22:05:42

使用以下代码：

创建 AB.py

from random import choice

# Generate two test listings, xxA.txt and yyB.txt, each made of a 3-line
# header followed by `repeat` random data lines, then report how many lines
# and how many distinct lines each file holds.

# Suffixes '500'..'599' appended to '430' to form the job id
n = tuple( str(x) for x in xrange(500,600))
days = ('01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16',
        '17','18','19','20','21','22','23','24','25','26','27','28')
# not '29','30','31' to avoid problems with strptime() on last days of february
months = days[0:12]
# NOTE(review): this slice yields '01'..'23', so hour '00' is never generated
hours = days[0:23]
# NOTE(review): '08' is missing from this list of minute/second values
ms = ['00','01','02','03','04','05','06','07','09'] + [str(x) for x in xrange(10,60)]

# Number of data lines written to each file
repeat = 30000

with open('xxA.txt','w') as f:
    # 430794  group_atlas.atlas084    12  181 4     02/03/11 22:02:37   02/03/11 22:05:42
    ch = ('On 23/03/11 00:40:03\n'
          'JobID   Group.User          Ctime   Wtime   Status  QDate               CDate\n'
          '===================================================================================\n')
    f.write(ch)
    for i in xrange(repeat):
        # Random job id and random QDate; the CDate field is constant
        line  = '430%s  group_atlas.atlas084    12  181 4   \t%s/%s/%s %s:%s:%s\t02/03/11 22:05:42\n' %\
                (choice(n),
                 choice(days),choice(months),choice(('10','11')),
                 choice(hours),choice(ms),choice(ms))
        f.write(line)


with open('yyB.txt','w') as f:
    # 430794  group_atlas.atlas084    12  181 4     02/03/11 22:02:37   02/03/11 22:05:42
    ch = ('On 25/03/11 13:45:24\n'
          'JobID   Group.User          Ctime   Wtime   Status  QDate               CDate\n'
          '===================================================================================\n')
    f.write(ch)
    for i in xrange(repeat):
        # Same line layout as for xxA.txt
        line  = '430%s  group_atlas.atlas084    12  181 4   \t%s/%s/%s %s:%s:%s\t02/03/11 22:05:42\n' %\
                (choice(n),
                 choice(days),choice(months),choice(('10','11')),
                 choice(hours),choice(ms),choice(ms))
        f.write(line)

# Compare the total number of lines with the number of distinct lines
with open('xxA.txt') as g:
    print 'readlines of xxA.txt :',len(g.readlines())
    g.seek(0,0)
    print 'set of xxA.txt :',len(set(g))

with open('yyB.txt') as g:
    print 'readlines of yyB.txt :',len(g.readlines())
    g.seek(0,0)
    print 'set of yyB.txt :',len(set(g))

然后我运行了这 3 个程序：

“合并正则表达式.py”

#!/usr/bin/env python

from time import clock,mktime,strptime,strftime
from sets import Set
import re

infunc = []

def sorting_merge(o_file, c_file, m_file ):
    """Merge the unique data lines of o_file and c_file into m_file, sorted by date.

    Benchmark variant "regex": the sort key of a line is the mktime() value
    of the first date/time that the regular expression finds in it.  A
    clock() reading is appended to the module-level list infunc before and
    after every stage so the caller can print per-stage timings.
    """
    infunc.append(clock()) #infunc[0]
    pat = re.compile('([0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d)')
    output = Set()

    def rmHead(filename, a_set):
        # Add the stripped lines that follow the header of *filename* to
        # a_set and return the header lines.
        f_n = open(filename, 'r')
        f_n.readline()  # the first line is not kept
        head = []
        for line in f_n:
            head.append(line) # line of the header
            # the header ends with the first line made only of '=', spaces
            # and line-ending characters
            if line.strip('= \r\n')=='':  break
        for line in f_n:
            a_set.add(line.rstrip())
        f_n.close()
        return head

    infunc.append(clock()) #infunc[1]
    head = rmHead(o_file, output)
    infunc.append(clock()) #infunc[2]
    # head is overwritten: the header written out is the one of c_file
    head = rmHead(c_file, output)
    infunc.append(clock()) #infunc[3]
    if '' in output:  output.remove('')

    infunc.append(clock()) #infunc[4]
    # Pair every line with its timestamp so that a plain sort() orders by date
    output = [ (mktime(strptime(pat.search(line).group(),'%d/%m/%y %H:%M:%S')),line)
               for line in output ]
    infunc.append(clock()) #infunc[5]
    output.sort()
    infunc.append(clock()) #infunc[6]

    # Write a fresh "On <date>" line, the header, then the sorted lines
    fm = open(m_file,'w')
    fm.write(strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head)))
    for t,line in output:
        fm.write(line + '\n')
    fm.close()
    infunc.append(clock()) #infunc[7]



c_f = "xxA.txt"
o_f = "yyB.txt"

t1 = clock()
sorting_merge(o_f, c_f, 'zz_mergedr.txt')
t2 = clock()
print 'merging regex'
print 'total time of execution :',t2-t1
print '              launching :',infunc[1] - t1
print '            preparation :',infunc[1] - infunc[0]
print '    reading of 1st file :',infunc[2] - infunc[1]
print '    reading of 2nd file :',infunc[3] - infunc[2]
print '      output.remove(\'\') :',infunc[4] - infunc[3]
print 'creation of list output :',infunc[5] - infunc[4]
print '      sorting of output :',infunc[6] - infunc[5]
print 'writing of merging file :',infunc[7] - infunc[6]
print 'closing of the function :',t2-infunc[7]

“合并 split.py”

#!/usr/bin/env python

from time import clock,mktime,strptime,strftime
from sets import Set

infunc = []

def sorting_merge(o_file, c_file, m_file ):
    """Merge the unique data lines of o_file and c_file into m_file, sorted by date.

    Benchmark variant "split": the sort key of a line is the mktime() value
    of its next-to-last tab-separated field.  A clock() reading is appended
    to the module-level list infunc before and after every stage so the
    caller can print per-stage timings.
    """
    infunc.append(clock()) #infunc[0]
    output = Set()

    def rmHead(filename, a_set):
        # Add the stripped lines that follow the header of *filename* to
        # a_set and return the header lines.
        f_n = open(filename, 'r')
        f_n.readline()  # the first line is not kept
        head = []
        for line in f_n:
            head.append(line) # line of the header
            # the header ends with the first line made only of '=', spaces
            # and line-ending characters
            if line.strip('= \r\n')=='':  break
        for line in f_n:
            a_set.add(line.rstrip())
        f_n.close()
        return head

    infunc.append(clock()) #infunc[1]
    head = rmHead(o_file, output)
    infunc.append(clock()) #infunc[2]
    # head is overwritten: the header written out is the one of c_file
    head = rmHead(c_file, output)
    infunc.append(clock()) #infunc[3]
    if '' in output:  output.remove('')

    infunc.append(clock()) #infunc[4]
    # Pair every line with its timestamp so that a plain sort() orders by date
    output = [ (mktime(strptime(line.split('\t')[-2],'%d/%m/%y %H:%M:%S')),line)
               for line in output ]
    infunc.append(clock()) #infunc[5]
    output.sort()
    infunc.append(clock()) #infunc[6]

    # Write a fresh "On <date>" line, the header, then the sorted lines
    fm = open(m_file,'w')
    fm.write(strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head)))
    for t,line in output:
        fm.write(line + '\n')
    fm.close()
    infunc.append(clock()) #infunc[7]



c_f = "xxA.txt"
o_f = "yyB.txt"

t1 = clock()
sorting_merge(o_f, c_f, 'zz_mergeds.txt')
t2 = clock()
print 'merging split'
print 'total time of execution :',t2-t1
print '              launching :',infunc[1] - t1
print '            preparation :',infunc[1] - infunc[0]
print '    reading of 1st file :',infunc[2] - infunc[1]
print '    reading of 2nd file :',infunc[3] - infunc[2]
print '      output.remove(\'\') :',infunc[4] - infunc[3]
print 'creation of list output :',infunc[5] - infunc[4]
print '      sorting of output :',infunc[6] - infunc[5]
print 'writing of merging file :',infunc[7] - infunc[6]
print 'closing of the function :',t2-infunc[7]

“合并杀手”

#!/usr/bin/env python

from time import clock,strftime
from sets import Set
import re

infunc = []

def sorting_merge(o_file, c_file, m_file ):
    """Merge the unique data lines of o_file and c_file into m_file, sorted by date.

    Benchmark variant "killer": instead of converting dates with strptime()
    and mktime(), the sort key is the tuple of strings (year, month, day,
    time) captured from the first date/time found in the line.  A clock()
    reading is appended to the module-level list infunc before and after
    every stage so the caller can print per-stage timings.
    """
    infunc.append(clock()) #infunc[0]
    # Groups: 1 = day, 2 = month, 3 = two-digit year, 4 = HH:MM:SS
    patk = re.compile('([0123]\d)/([01]\d)/(\d{2}) ([012]\d:[0-6]\d:[0-6]\d)')
    output = Set()

    def rmHead(filename, a_set):
        # Add the stripped lines that follow the header of *filename* to
        # a_set and return the header lines.
        f_n = open(filename, 'r')
        f_n.readline()  # the first line is not kept
        head = []
        for line in f_n:
            head.append(line) # line of the header
            # the header ends with the first line made only of '=', spaces
            # and line-ending characters
            if line.strip('= \r\n')=='':  break
        for line in f_n:
            a_set.add(line.rstrip())
        f_n.close()
        return head

    infunc.append(clock()) #infunc[1]
    head = rmHead(o_file, output)
    infunc.append(clock()) #infunc[2]
    # head is overwritten: the header written out is the one of c_file
    head = rmHead(c_file, output)
    infunc.append(clock()) #infunc[3]
    if '' in output:  output.remove('')

    infunc.append(clock()) #infunc[4]
    # group(3,2,1,4) reorders the captures to (year, month, day, time), so
    # comparing the string tuples orders the lines by date
    output = [ (patk.search(line).group(3,2,1,4),line)for line in output ]
    infunc.append(clock()) #infunc[5]
    output.sort()
    infunc.append(clock()) #infunc[6]

    # Write a fresh "On <date>" line, the header, then the sorted lines
    fm = open(m_file,'w')
    fm.write(strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head)))
    for t,line in output:
        fm.write(line + '\n')
    fm.close()
    infunc.append(clock()) #infunc[7]



c_f = "xxA.txt"
o_f = "yyB.txt"

t1 = clock()
sorting_merge(o_f, c_f, 'zz_mergedk.txt')
t2 = clock()
print 'merging killer'
print 'total time of execution :',t2-t1
print '              launching :',infunc[1] - t1
print '            preparation :',infunc[1] - infunc[0]
print '    reading of 1st file :',infunc[2] - infunc[1]
print '    reading of 2nd file :',infunc[3] - infunc[2]
print '      output.remove(\'\') :',infunc[4] - infunc[3]
print 'creation of list output :',infunc[5] - infunc[4]
print '      sorting of output :',infunc[6] - infunc[5]
print 'writing of merging file :',infunc[7] - infunc[6]
print 'closing of the function :',t2-infunc[7]

结果

merging regex
total time of execution : 14.2816595405
              launching : 0.00169211450059
            preparation : 0.00168093989599
    reading of 1st file : 0.163582242995
    reading of 2nd file : 0.141301478261
      output.remove('') : 2.37460347614e-05
     creation of output : 13.4460212122
      sorting of output : 0.216363532237
writing of merging file : 0.232923737514
closing of the function : 0.0797514767938

merging split
total time of execution : 13.7824474898
              launching : 4.10666718815e-05
            preparation : 2.70984161395e-05
    reading of 1st file : 0.154349784679
    reading of 2nd file : 0.136050810927
      output.remove('') : 2.06730184981e-05
     creation of output : 12.9691854691
      sorting of output : 0.218704332534
writing of merging file : 0.225259076223
closing of the function : 0.0788362766776

merging killer
total time of execution : 2.14315311024
              launching : 0.00206199391263
            preparation : 0.00205026057781
    reading of 1st file : 0.158711791582
    reading of 2nd file : 0.138976601775
      output.remove('') : 2.37460347614e-05
     creation of output : 0.621466415424
      sorting of output : 0.823161602941
writing of merging file : 0.227701565422
closing of the function : 0.171049393149

在杀手程序中，对输出进行排序需要约 4 倍的时间，但是将输出创建为列表的时间除以了 21！总体而言，执行时间至少减少了 85%。

python - 如何仅合并从 file_a 到 file_b 的唯一行？

4 回答 4

创建 AB.py

“合并正则表达式.py”

“合并 split.py”

“合并杀手”

Related

Reference