35

I have a text file, say really_big_file.txt, that contains:

line 1
line 2
line 3
line 4
...
line 99999
line 100000

I want to write a Python script that splits really_big_file.txt into smaller files, each 300 lines long. For example, small_file_300.txt would contain lines 1-300, small_file_600 would contain lines 301-600, and so on, until there are enough small files to hold every line of the big file.

I would appreciate any suggestions on the easiest way to accomplish this with Python.


10 Answers

52
lines_per_file = 300
smallfile = None
with open('really_big_file.txt') as bigfile:
    for lineno, line in enumerate(bigfile):
        if lineno % lines_per_file == 0:
            if smallfile:
                smallfile.close()
            small_filename = 'small_file_{}.txt'.format(lineno + lines_per_file)
            smallfile = open(small_filename, "w")
        smallfile.write(line)
    if smallfile:
        smallfile.close()
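A quick way to sanity-check the chunking logic above on a tiny sample (the temp directory, 7-line input, and chunk size of 3 are just for illustration):

```python
import os
import tempfile

lines_per_file = 3  # small chunk size for the demo
tmpdir = tempfile.mkdtemp()
bigpath = os.path.join(tmpdir, 'really_big_file.txt')
with open(bigpath, 'w') as f:
    f.writelines('line {}\n'.format(i) for i in range(1, 8))  # 7 lines

smallfile = None
with open(bigpath) as bigfile:
    for lineno, line in enumerate(bigfile):
        if lineno % lines_per_file == 0:
            if smallfile:
                smallfile.close()
            small_filename = os.path.join(
                tmpdir, 'small_file_{}.txt'.format(lineno + lines_per_file))
            smallfile = open(small_filename, 'w')
        smallfile.write(line)
    if smallfile:
        smallfile.close()

# 7 lines with chunks of 3 -> files named small_file_3, _6 and _9,
# where the last one holds only the single leftover line
names = sorted(p for p in os.listdir(tmpdir) if p.startswith('small_file'))
print(names)  # ['small_file_3.txt', 'small_file_6.txt', 'small_file_9.txt']
```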
answered 2013-04-30T01:35:50.863
35

Using the itertools grouper recipe:

from itertools import zip_longest

def grouper(n, iterable, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return zip_longest(fillvalue=fillvalue, *args)

n = 300

with open('really_big_file.txt') as f:
    for i, g in enumerate(grouper(n, f, fillvalue=''), 1):
        with open('small_file_{0}'.format(i * n), 'w') as fout:
            fout.writelines(g)

The advantage of this method, as opposed to storing each line in a list, is that it works with iterables line by line, so it doesn't have to store each small_file into memory at once.
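The streaming behaviour can be seen in a small demo; the generator below stands in for an open file (names and sizes here are illustrative):

```python
from itertools import zip_longest

def grouper(n, iterable, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    args = [iter(iterable)] * n
    return zip_longest(fillvalue=fillvalue, *args)

# a lazy source of 7 lines, like an open file object
source = ('line {}\n'.format(i) for i in range(1, 8))
groups = grouper(3, source, fillvalue='')

# nothing has been read yet; each next() pulls exactly one group
first = next(groups)
print(first)  # ('line 1\n', 'line 2\n', 'line 3\n')
```

The final group is padded with the fill value, which is why the answer above uses `fillvalue=''` so the padding writes nothing.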

Note that, in this case, the last file will be small_file_100200 but will only go until line 100000. This happens because fillvalue='', meaning I write nothing to the file when I don't have any more lines left to write, since the group sizes don't divide equally. You can fix this by writing to a temp file and then renaming it afterwards, instead of naming it first like I have. Here is how that can be done.

import os, tempfile

with open('really_big_file.txt') as f:
    for i, g in enumerate(grouper(n, f, fillvalue=None)):
        with tempfile.NamedTemporaryFile('w', delete=False) as fout:
            for j, line in enumerate(g, 1): # count number of lines in group
                if line is None:
                    j -= 1 # don't count this line
                    break
                fout.write(line)
        os.rename(fout.name, 'small_file_{0}.txt'.format(i * n + j))

This time fillvalue=None, and I check each line for None; when it occurs, I know the process has finished, so I subtract 1 from j to not count the fill and then write the file.

answered 2013-04-29T23:31:46.337
4

I do this in a more understandable way, using fewer shortcuts, in order to give you a further understanding of how and why this works. Previous answers work, but if you are not familiar with certain built-in functions, you will not understand what the function is doing.

Since you posted no code, I decided to do it this way, given that you could be unfamiliar with anything other than basic Python syntax; the way you phrased the question made it seem as though you had not tried, nor had any clue how to approach the problem.

Here are the steps to do this in basic Python:

First, you should read your file into a list for safekeeping:

my_file = 'really_big_file.txt'
hold_lines = []
with open(my_file,'r') as text_file:
    for row in text_file:
        hold_lines.append(row)

Second, you need to set up a way of creating a new file by name! I would suggest a loop along with a couple of counters:

outer_count = 1
line_count = 0
sorting = True
while sorting:
    count = 0
    increment = (outer_count-1) * 300
    left = len(hold_lines) - increment
    file_name = "small_file_" + str(outer_count * 300) + ".txt"

Third, inside that loop you need some nested loops that will save the correct rows into an array:

    hold_new_lines = []
    if left < 300:
        while count < left:
            hold_new_lines.append(hold_lines[line_count])
            count += 1
            line_count += 1
        sorting = False
    else:
        while count < 300:
            hold_new_lines.append(hold_lines[line_count])
            count += 1
            line_count += 1

Last thing, again in your first loop, you need to write the new file and add your last counter increment so your loop will go through again and write a new file:

outer_count += 1
with open(file_name,'w') as next_file:
    for row in hold_new_lines:
        next_file.write(row)

Note: if the number of lines is not divisible by 300, the name of the last file will not correspond to its last line.
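If the last file's name needs to match its actual last line, one option (a sketch, not part of the original answer) is to cap the computed name at the total line count:

```python
# With 1000 lines and chunks of 300, the naive names would end at
# 300, 600, 900, 1200 -- capping at the total fixes the last one.
total_lines = 1000
chunk = 300
names = []
for start in range(0, total_lines, chunk):
    end = min(start + chunk, total_lines)  # cap at the real last line
    names.append('small_file_{}.txt'.format(end))
print(names)
# ['small_file_300.txt', 'small_file_600.txt',
#  'small_file_900.txt', 'small_file_1000.txt']
```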

It is important to understand why these loops work. You have it set so that on the next loop, the name of the file that you write changes, because your name is dependent on a changing variable. This is a very useful scripting tool for file accessing, opening, writing, organizing etc.

In case you cannot follow what is in which loop, here is the entirety of the function:

my_file = 'really_big_file.txt'
sorting = True
hold_lines = []
with open(my_file,'r') as text_file:
    for row in text_file:
        hold_lines.append(row)
outer_count = 1
line_count = 0
while sorting:
    count = 0
    increment = (outer_count-1) * 300
    left = len(hold_lines) - increment
    file_name = "small_file_" + str(outer_count * 300) + ".txt"
    hold_new_lines = []
    if left < 300:
        while count < left:
            hold_new_lines.append(hold_lines[line_count])
            count += 1
            line_count += 1
        sorting = False
    else:
        while count < 300:
            hold_new_lines.append(hold_lines[line_count])
            count += 1
            line_count += 1
    outer_count += 1
    with open(file_name,'w') as next_file:
        for row in hold_new_lines:
            next_file.write(row)
answered 2013-04-30T01:08:29.480
2
lines_per_file = 300  # Lines on each small file
lines = []  # Stores lines not yet written on a small file
lines_counter = 0  # Same as len(lines)
created_files = 0  # Counting how many small files have been created

with open('really_big_file.txt') as big_file:
    for line in big_file:  # Go through the whole big file
        lines.append(line)
        lines_counter += 1
        if lines_counter == lines_per_file:
            idx = lines_per_file * (created_files + 1)
            with open('small_file_%s.txt' % idx, 'w') as small_file:
                # Write all lines on small file
                small_file.write(''.join(lines))  # lines already end with '\n'
            lines = []  # Reset variables
            lines_counter = 0
            created_files += 1  # One more small file has been created
    # After for-loop has finished
    if lines_counter:  # There are still some lines not written on a file?
        idx = lines_per_file * (created_files + 1)
        with open('small_file_%s.txt' % idx, 'w') as small_file:
            # Write them on a last small file
            small_file.write(''.join(lines))  # lines already end with '\n'
        created_files += 1

print('%s small files (with %s lines each) were created.'
      % (created_files, lines_per_file))
answered 2013-04-30T00:21:05.687
2
import csv
import os
import re

MAX_CHUNKS = 300


def writeRow(idr, row):
    # append the row to the chunk file for this id (Python 3: text mode, newline='')
    with open("file_%d.csv" % idr, 'a', newline='') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        writer.writerow(row)

def cleanup():
    for f in os.listdir("."):
        if re.search("file_.*", f):
            os.remove(os.path.join(".", f))

def main():
    cleanup()
    with open("large_file.csv", newline='') as results:
        r = csv.reader(results, delimiter=',', quotechar='"')
        idr = 1
        for i, row in enumerate(r, 1):
            writeRow(idr, row)
            if i % MAX_CHUNKS == 0:  # start a new file every MAX_CHUNKS rows
                idr += 1

if __name__ == "__main__": main()
answered 2015-06-25T19:48:18.743
2

If you want to split it into 2 files, there is a very simple way to do it, for example:

with open("myInputFile.txt",'r') as file:
    lines = file.readlines()

with open("OutputFile1.txt",'w') as file:
    for line in lines[:int(len(lines)/2)]:
        file.write(line)

with open("OutputFile2.txt",'w') as file:
    for line in lines[int(len(lines)/2):]:
        file.write(line)

Making this dynamic would be:

with open("inputFile.txt",'r') as file:
    lines = file.readlines()

batch = 10  # number of output files
size = len(lines) // batch
start = 0
for i in range(1, batch + 1):
    # the last file also takes any remainder lines
    end = start + size if i < batch else len(lines)
    with open("splitText_" + str(i) + ".txt", 'w') as file:
        for line in lines[start:end]:
            file.write(line)
    start = end
answered 2021-10-20T07:12:22.360
0

I had to do the same with a 650000 line file.

Use the enumerate index and integer division (//) with the chunk size.

When that number changes, close the current file and open a new one.

Here is a Python 3 solution using format strings:

chunk = 50000  # number of lines from the big file to put in each small file
this_small_file = open('./a_folder/0', 'a')

with open('massive_web_log_file') as file_to_read:
    for i, line in enumerate(file_to_read):  # iterate lazily, no readlines()
        file_name = f'./a_folder/{i // chunk}'
        print(i, file_name)  # a bit of feedback that slows the process down a lot

        if file_name != this_small_file.name:
            # crossed a chunk boundary: switch to the next small file first,
            # so the boundary line lands in the new file
            this_small_file.close()
            this_small_file = open(file_name, 'a')
        this_small_file.write(line)

this_small_file.close()
answered 2018-11-28T05:07:39.817
0

In Python, files are simple iterators. This gives the option of iterating over them multiple times, always continuing from the last place the previous iterator reached. Keeping this in mind, we can use islice to get the next 300 lines of the file in each successive pass of a loop. The tricky part is knowing when to stop. For this we "sample" the file for the next line; once it is exhausted, we can break the loop:

from itertools import islice

lines_per_file = 300
with open("really_big_file.txt") as file:
    i = 1
    while True:
        try:
            checker = next(file)
        except StopIteration:
            break
        with open(f"small_file_{i*lines_per_file}.txt", 'w') as out_file:
            out_file.write(checker)
            for line in islice(file, lines_per_file-1):
                out_file.write(line)
        i += 1
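A quick check of this chunking logic on an in-memory stream (the StringIO stands in for the open file; the 7-line input and chunk size of 3 are illustrative):

```python
import io
from itertools import islice

lines_per_file = 3
file = io.StringIO(''.join('line {}\n'.format(i) for i in range(1, 8)))  # 7 lines

chunks = []
while True:
    try:
        checker = next(file)  # sample one line; StopIteration means we're done
    except StopIteration:
        break
    # the sampled line plus up to lines_per_file-1 more from the same iterator
    chunks.append([checker] + list(islice(file, lines_per_file - 1)))

print([len(c) for c in chunks])  # [3, 3, 1] -- the last chunk is partial
```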
answered 2022-01-12T11:00:16.933
0

files is set to the number of files you want to split the main file into; in my example, I want to get 10 files from my main file.

files = 10
with open("data.txt", "r") as data:
    emails = data.readlines()
batch = -(-len(emails) // files)  # ceiling division: lines per small file
for idx, log in enumerate(emails):
    fileid = idx // batch
    with open("minifile{file}.txt".format(file=fileid + 1), 'a+') as file:
        file.write(log)
answered 2020-10-11T17:10:57.067
0
with open('/really_big_file.txt') as infile:
    file_line_limit = 300
    counter = -1
    file_index = 0
    outfile = None
    for line in infile:  # iterate lazily instead of readlines()
        counter += 1
        if counter % file_line_limit == 0:
            # close old file
            if outfile is not None:
                outfile.close()
            # create new file
            file_index += 1
            outfile = open('small_file_%03d.txt' % file_index, 'w')
        # write to file
        outfile.write(line)
    if outfile is not None:
        outfile.close()
answered 2022-02-10T09:26:57.480