python - Extract Values between two strings in a text file using python

Question

Let's say I have a text file with the content below:

fdsjhgjhg
fdshkjhk
Start
Good Morning
Hello World
End
dashjkhjk
dsfjkhk

Now I need to write Python code that reads the text file and copies the contents between Start and End to another file.

I wrote the following code.

# Attempt from the question: read data.txt line by line and write to result.txt.
inFile = open("data.txt")
outFile = open("result.txt", "w")
buffer = []            # lines collected since the last "Start" line (or since the top of the file)
keepCurrentSet = True  # whether the buffer may be written at the next "Start" line
for line in inFile:
    # Every line is buffered first, so a "Start" line ends up in the buffer too.
    buffer.append(line)
    if line.startswith("Start"):
        #---- starts a new data set
        # The write happens here, on reaching "Start": what is written is
        # everything buffered up to and including this "Start" line.
        if keepCurrentSet:
            outFile.write("".join(buffer))
        #now reset our state
        keepCurrentSet = False
        buffer = []
    elif line.startswith("End"):
        # "End" only re-arms the flag; the lines buffered since "Start" are
        # not written here.
        keepCurrentSet = True
inFile.close()
outFile.close()

I'm not getting the desired output: I'm just getting Start. What I want is all the lines between Start and End, excluding Start and End themselves.

score 51 · Accepted Answer

如果您的文本文件中有多组 "Start" 和 "End"，这段代码会把各组之间的所有数据一并写入输出文件，且不包含 "Start" 和 "End" 行本身。

# Copy every line that sits between a "Start" line and the following "End"
# line. The marker lines themselves are never written, and any number of
# Start/End pairs in the input is handled.
with open('path/to/input') as source, open('path/to/output', 'w') as sink:
    inside_block = False
    for raw_line in source:
        marker = raw_line.strip()
        if marker == "Start":
            inside_block = True
        elif marker == "End":
            inside_block = False
        elif inside_block:
            sink.write(raw_line)

score 7 · Accepted Answer

如果文本文件不一定很大，您可以获取文件的全部内容，然后使用正则表达式：

import re
# Read the whole file at once; only suitable when the file fits in memory.
with open('data.txt') as myfile:
    content = myfile.read()

# Capture only the text between the marker lines: group(1) leaves out the
# "Start" and "End" lines themselves. MULTILINE anchors both markers to whole
# lines, and DOTALL lets "." span the newlines inside the block.
match = re.search(r'^Start\n(.*?)^End$', content, re.DOTALL | re.MULTILINE)
# When no Start/End pair exists, write an empty file instead of crashing.
text = match.group(1) if match else ''
with open("result.txt", "w") as myfile2:
    myfile2.write(text)

score 5 · Accepted Answer

我不是 Python 专家，但这段代码应该可以完成这项工作。

# Copy the lines strictly between each "Start" line and the next "End" line.
# The context manager closes both files even if reading or writing fails.
with open("data.txt") as inFile, open("result.txt", "w") as outFile:
    keepCurrentSet = False
    for line in inFile:
        # Order matters: clear the flag before the write so "End" is skipped,
        # and set it after the write so "Start" is skipped as well.
        if line.startswith("End"):
            keepCurrentSet = False

        if keepCurrentSet:
            outFile.write(line)

        if line.startswith("Start"):
            keepCurrentSet = True

score 5 · Accepted Answer

使用itertools.dropwhile, itertools.takewhile, itertools.islice:

import itertools

# Lazily stream the first Start...End block of data.txt into result.txt.
with open('data.txt') as f, open('result.txt', 'w') as fout:
    def before_start(line):
        return line.strip() != 'Start'

    def before_end(line):
        return line.strip() != 'End'

    # Skip everything ahead of the "Start" line ...
    from_start = itertools.dropwhile(before_start, f)
    # ... drop the "Start" line itself ...
    after_start = itertools.islice(from_start, 1, None)
    # ... and stop just before the "End" line.
    fout.writelines(itertools.takewhile(before_end, after_start))

更新：正如 inspectorG4dget 在评论中指出的那样，上面的代码只会复制第一个块。要复制多个块，请使用以下代码：

import itertools

# Stream every Start...End block of data.txt into result.txt, one after another.
with open('data.txt', 'r') as f, open('result.txt', 'w') as fout:
    def not_start(line):
        return line.strip() != 'Start'

    def not_end(line):
        return line.strip() != 'End'

    while True:
        # Resume from wherever the previous pass left the file iterator.
        remaining = itertools.dropwhile(not_start, f)
        start_line = next(remaining, None)
        if start_line is None:
            # No further "Start" line: the input is exhausted.
            break
        # takewhile consumes the "End" line without yielding it.
        block_lines = itertools.takewhile(not_end, remaining)
        fout.writelines(block_lines)

score 3 · Accepted Answer

将 outFile.write 调用移到第二个 if 分支中：

# Collect the lines of each Start...End block and write the block out when
# its "End" line is reached. The context manager closes both files on error.
with open("data.txt") as inFile, open("result.txt", "w") as outFile:
    collected = None  # None while outside a block, a list while inside one
    for line in inFile:
        if line.startswith("Start"):
            collected = []
        elif line.startswith("End"):
            # An "End" with no preceding "Start" has nothing to flush.
            if collected is not None:
                outFile.write("".join(collected))
            collected = None
        elif collected is not None:
            collected.append(line)

score 2 · Accepted Answer

import re

# Extract the text of every Start...End block with a single regex pass.
with open("data.txt") as inFile, open("result.txt", "w") as outFile:
    content = inFile.read()
    # DOTALL lets "." cross newlines so multi-line blocks match; MULTILINE
    # anchors the markers to whole lines. Only the captured group (the lines
    # between the markers) is returned, so "Start" and "End" are left out.
    blocks = re.findall(r"^Start\n(.*?)^End$", content, re.DOTALL | re.MULTILINE)
    outFile.write("".join(blocks))

score 1 · Accepted Answer

我会这样处理：

# Slice the first Start...End block out of the list of lines.
# Raises ValueError if either marker line is missing.
with open("data.txt") as inFile, open("result.txt", "w") as outFile:
    data = inFile.readlines()

    first = data.index('Start\n') + 1
    # Search for "End" only after "Start"; an earlier stray "End" line would
    # otherwise produce an empty slice.
    last = data.index('End\n', first)
    outFile.write("".join(data[first:last]))

score 0 · Accepted Answer

如果想在提取两个字符串之间的行时保留开始行和结束行（关键字），可以使用下面的方法。

请在下面找到我用来从 shell 脚本中提取 sql 语句的代码片段

def process_lines(in_filename, out_filename, start_kw, end_kw):
    """Copy keyword-delimited statements from one text file to another.

    A statement begins on a line that, after stripping leading whitespace and
    lowercasing, starts with ``start_kw``. It runs through the next line
    that, after stripping trailing whitespace, ends with ``end_kw``. Both
    delimiter lines are kept in the output; lines outside any statement are
    dropped.

    Args:
        in_filename: Path of the file to read (UTF-8, decode errors ignored).
        out_filename: Path of the file to write; it is truncated if present.
        start_kw: Prefix, or tuple of prefixes, handed to ``str.startswith``.
            Lines are lowercased before the comparison, so use lowercase.
        end_kw: Suffix, or tuple of suffixes, handed to ``str.endswith``.

    Raises:
        FileNotFoundError: A file could not be found; re-raised after a
            message is printed.
        OSError: Any other OS-level failure while opening; re-raised after a
            message is printed.
    """
    try:
        inp = open(in_filename, 'r', encoding='utf-8', errors='ignore')
        try:
            out = open(out_filename, 'w+', encoding='utf-8', errors='ignore')
        except BaseException:
            # Do not leak the input handle when the output cannot be opened.
            inp.close()
            raise
    except FileNotFoundError as err:
        print(f"File {in_filename} not found", err)
        raise
    except OSError as err:
        print(f"OS error occurred trying to open {in_filename}", err)
        raise
    except Exception as err:
        print(f"Unexpected error opening {in_filename} is",  repr(err))
        raise
    else:
        with inp, out:
            copy = False
            for line in inp:
                starts = line.lstrip().lower().startswith(start_kw)
                ends = line.rstrip().endswith(end_kw)
                # Turn copying on before the write so the start line is kept ...
                if starts:
                    copy = True
                if copy:
                    out.write(line)
                # ... and off after the write so the end line is kept too.
                # A one-line statement (starts and ends) passes through both.
                if ends:
                    copy = False


if __name__ == '__main__':
    # Example run: pull the SQL statements out of a shell script.
    infile = "/Users/testuser/Downloads/testdir/BTEQ_TEST.sh"
    outfile = f"{infile}.sql"
    # Lowercase on purpose: process_lines lowercases each line before testing
    # the prefix. Each keyword is listed once; str.startswith takes a tuple.
    statement_starts = ('database', 'create', 'insert', 'delete', 'update', 'merge')
    statement_end = ";"
    process_lines(infile, outfile, statement_starts, statement_end)

score 0 · Accepted Answer

文件是 Python 中的迭代器，因此这意味着您不需要保存“标志”变量来告诉您要编写哪些行。您可以在到达起始行时简单地使用另一个循环，并在到达结束行时将其中断：

# A file object is its own iterator, so the inner loop continues from where
# the outer loop stopped; no flag variable is needed.
# The output goes to result.txt, the file name used in the question.
with open("data.txt") as in_file, open("result.txt", 'w') as out_file:
    for line in in_file:
        if line.strip() == "Start":
            # Consume the block's lines until its "End" marker, then hand
            # control back to the outer loop to look for the next "Start".
            for block_line in in_file:
                if block_line.strip() == "End":
                    break
                out_file.write(block_line)

python - Extract Values between two strings in a text file using python

9 回答 9

Related

Reference