假设所有 [SEQUENCE ID] 确实能够放入内存,并且您的大部分数据实际上位于序列行上(与提供的示例不同)- 您可以选择先解析其中一个文件(您问题中的 file2),不仅记录每个 [SEQUENCE ID],还记录每个此类标识符在文件中的位置。这种方法可以让您在基本不打乱当前工作流程的情况下继续进行(例如,不必去学习数据库):
def get_indexes(filename):
    """Map every sequence identifier in *filename* to its file position.

    The file is read as consecutive pairs of lines: an identifier line
    followed by one data line.  Only the identifiers and their offsets are
    kept in memory; the data lines are read and discarded.

    Args:
        filename: Path of the file to index.

    Returns:
        A dict mapping each identifier (surrounding whitespace stripped) to
        the value ``file.tell()`` reported just before that identifier line
        was read.  The value can be passed to ``seek()`` on a file opened
        the same way to jump back to the identifier line.  If an identifier
        occurs more than once, the position of its last occurrence wins.
    """
    sequences = {}
    with open(filename, "rt") as file:
        while True:
            # Record the position *before* reading, so that seeking to it
            # later lands on the identifier line itself.
            position = file.tell()
            seq_id = file.readline()
            if not seq_id:
                # readline() returns an empty string only at end of file.
                break
            sequences[seq_id.strip()] = position
            # skip corresponding data line:
            file.readline()
    return sequences
def fetcher(filename1, filename2, sequences):
    """Yield the data line from *filename2* for each identifier in *filename1*.

    *filename1* is read sequentially as identifier/data line pairs.  For
    every identifier that is also a key of *sequences*, the second file is
    positioned at the stored offset and the data line following the
    identifier there is returned.

    Args:
        filename1: Path of the file whose identifiers drive the lookup.
        filename2: Path of the indexed file the data is fetched from.
        sequences: Mapping of identifier -> position in *filename2*, as
            built by ``get_indexes(filename2)``.

    Yields:
        ``(identifier, data)`` tuples, in the order the identifiers appear
        in *filename1*.  ``identifier`` is stripped of surrounding
        whitespace; ``data`` is the raw line read from *filename2*,
        including its trailing newline if present.  Identifiers missing
        from *sequences* are skipped.
    """
    with open(filename1, "rt") as file1, open(filename2, "rt") as file2:
        while True:
            seq_id = file1.readline()
            if not seq_id:
                # readline() returns an empty string only at end of file.
                break
            # Consume the paired data line of file1 so the next read is
            # an identifier again; its content is not needed.
            file1.readline()
            seq_id = seq_id.strip()
            if seq_id in sequences:
                # position file2 reading at the identifier:
                file2.seek(sequences[seq_id])
                # throw away id:
                file2.readline()
                data = file2.readline()
                yield seq_id, data
if __name__ == "__main__":
    # Index file2 once, then stream through file1 and print, for every
    # identifier found in both files, the identifier and its file2 data line.
    sequences = get_indexes("/data/file2")
    for seq_id, data in fetcher("/data/file1", "/data/file2", sequences):
        # Parenthesised single-argument form behaves the same as a Python 2
        # print statement and as the Python 3 print function.
        print("%s\n%s" % (seq_id, data))