Based on "Sorting a million 32-bit integers in 2MB of RAM using Python":
import sys
from functools import partial
from heapq import merge
from tempfile import TemporaryFile
# define sorting criteria
def second_column(line, default=float("inf")):
    """Return the sort key for *line*: its second ``;``-separated field as an int.

    Args:
        line: One input line whose fields are separated by ``;``.
        default: Key returned when the line has no second field or that
            field is not an integer. The default, ``float("inf")``,
            compares greater than every int, so such lines sort last.

    Returns:
        The integer value of the second field, or *default*.
    """
    try:
        # maxsplit=2 stops splitting once the second field has been isolated
        return int(line.split(";", 2)[1])  # use int() for numeric sort
    except (IndexError, ValueError):
        # IndexError: the line has no ";"; ValueError: field is not an integer
        return default
# sort lines in small batches, write intermediate results to temporary files
sorted_files = []
nbytes = 1 << 20  # load around nbytes bytes at a time
try:
    # readlines(nbytes) returns [] at EOF, which is the sentinel ending iter()
    for lines in iter(partial(sys.stdin.readlines, nbytes), []):
        # Only the very last input line can lack a newline; terminate it so
        # that sorting cannot glue it onto whichever line ends up after it.
        if not lines[-1].endswith("\n"):
            lines[-1] += "\n"
        lines.sort(key=second_column)  # sort current batch
        f = TemporaryFile("w+")
        sorted_files.append(f)  # register first so it is closed on any error
        f.writelines(lines)
        f.seek(0)  # rewind
    # merge & write the result
    sys.stdout.writelines(merge(*sorted_files, key=second_column))
finally:
    # clean up, even if sorting, writing or merging failed
    for f in sorted_files:
        f.close()  # temporary file is deleted when it closes
heapq.merge() has had a key parameter since Python 3.5. On older Python versions, you could try mergeiter() from Martijn Pieters' answer instead, or do a Schwartzian transform (decorate each line with its key, merge, then strip the key):
def _decorate(order, lines):
    """Yield ``(key, order, line)`` for each line of one sorted batch file."""
    # A helper generator binds `order` eagerly; a nested generator expression
    # would look it up lazily and every batch would see the last index.
    for line in lines:
        yield second_column(line), order, line


# Decorate each line with its key so merge() needs no key= parameter.  The
# batch index breaks ties between files, so equal-key lines keep their input
# order (batches are created in input order and list.sort() is stable).
iters = [_decorate(order, file) for order, file in enumerate(sorted_files)]
sorted_lines = (line for _, _, line in merge(*iters))
sys.stdout.writelines(sorted_lines)
Usage:
$ python sort-k2-n.py < input.txt > output.txt