I am writing a Python script that takes strings from a .csv file and cuts them up in a specific way ("cleave after R and K, but not when followed by P"), allowing for up to two missed cleavage sites, and writes the resulting strings to a new .csv file. That part works like a charm.
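To make the cutting rule concrete, here is a minimal sketch of the kind of digest I mean (a toy example built on re.split, not my actual script; it leaves out the minimum-length filter and the first-fragment handling that my real code does):

import re

# Minimal sketch: cleave after every R or K that is not followed by P.
# Missed cleavages are modelled by joining neighbouring fragments.
# Note: re.split on a zero-width pattern needs Python 3.7+.
def digest(sequence, max_missed=2):
    fragments = [f for f in re.split(r'(?<=[RK])(?!P)', sequence) if f]
    peptides = []
    for missed in range(max_missed + 1):
        for i in range(len(fragments) - missed):
            peptides.append(''.join(fragments[i:i + missed + 1]))
    return peptides

print(digest('MKWVTFISLLLLFSSAYSRGVFRRDTHKPSEIAHRFK'))  # toy input sequence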
But then I need to take each of those strings and search another .csv file (roughly 725,000 entries) to see whether they are listed in that big file; if they are, they get written to a separate file. I have managed to do this (see the code below), but it is very slow: when I cut the big file down to about 2,000 entries instead of 725,000, the run took 15 seconds, which means the full file would take roughly 90 minutes. That is far too slow. How can I reduce the computation time?
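One idea I am unsure about for the slow part: as far as I can tell, the regex only ever matches a peptide that occurs as a whole word inside the reference column, so would it be equivalent (and much faster) to collect all of those words into a set once and then test each peptide with a plain membership check? A rough sketch of what I mean (the column index is the same one my code uses; that the sequences consist only of letters is an assumption on my part):

import csv
import re

# Rough sketch: gather every whole word from column 3 of the big reference file once,
# so each peptide becomes a single set lookup instead of ~725,000 regex searches.
reference_words = set()
with open('Reference Peptides (ENSG, Björn) TEST.csv', 'r') as ref_file:
    for ref_row in csv.reader(ref_file):
        if len(ref_row) > 2:
            reference_words.update(re.findall(r'\w+', ref_row[2]))  # assumes plain letter sequences

def is_unique(peptide):
    # Intended to mirror re.search(r"\b(?=\w)%s\b(?!\w)" % peptide, row[2]) over every row
    return peptide not in reference_words

Would that give the same result as the row-by-row regex search? In any case, here is my full script: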
import csv
import re
import time
# Input should be a .csv file with 2 columns (PrEST ID, PrEST Sequence)
in_file = open('Tryptic Sequences Input.csv','r')
in_file1 = open('Reference Peptides (ENSG, Björn) TEST.csv','r')
out_file = open('Tryptic Sequences Output.csv','w+')
out_file1 = open('Tryptic Sequences Output (non-unique peptides).csv','w+')
# Reader/Writer iterables
reader = csv.reader(in_file)
reader1 = csv.reader(in_file1)
in_list = list(reader)
in_list1 = list(reader1)
writer = csv.writer(out_file)
writer1 = csv.writer(out_file1)
headers = ('PrEST','Peptide')
writer.writerow(headers)
writer1.writerow(headers)
# Initiate variables
Peptide_list = [] # List for Peptides (resets for each PrEST)
ID_list = [] # List for PrEST IDs (resets for each PrEST)
Copy_list = [] # List for non-unique tryptic peptides
Copy_ID_list = []
Peptide = '' # Current peptide (no missed cleavages)
Peptide_MC1 = '' # Current peptide with 1 missed cleavage
Peptide_MC2 = '' # Current peptide with 2 missed cleavages
MC1 = 'N'
MC2 = 'N'
Unique = 'Y'
t0 = time.perf_counter()
# ------ Main PrEST for-loop -------
for row in range(len(in_list)): # For every PrEST (row)
    First = 'Y'
    PrEST_seq = in_list[row][1]
    # -------- Main AA-reader for-loop --------
    for n in range(len(PrEST_seq)): # For every AA in every PrEST
        if ((PrEST_seq[n:n+1] == 'R' or
             PrEST_seq[n:n+1] == 'K') and
                PrEST_seq[n+1:n+2] != 'P'):
            if First != 'Y': # Does not count first peptide + MCs (part of ABP)
                Peptide += PrEST_seq[n:n+1]
                if len(Peptide) >= 6: # Only appends peptide if longer than 6 AA
                    # KEY PART ---------------------------------------------------------------------
                    # Searches for non-unique peptides from in_file1
                    for line in range(len(in_list1)):
                        if re.search(r"\b(?=\w)%s\b(?!\w)" % Peptide, in_list1[line][2]) is not None:
                            Unique = 'N'
                            Copy_ID_list.append(in_list[row][0])
                            Copy_list.append(Peptide)
                            break
                    if Unique == 'Y':
                        ID_list.append(in_list[row][0])
                        Peptide_list.append(Peptide)
                    # (repeated twice below) --------------------------------------------------------
                Unique = 'Y' # Resets variable
                # -------- One missed cleavage while-loop --------
                Peptide_MC1 = Peptide
                m = n
                while MC1 == 'N' and m+1 <= len(PrEST_seq):
                    m += 1
                    if ((PrEST_seq[m:m+1] == 'R' or
                         PrEST_seq[m:m+1] == 'K') and
                            PrEST_seq[m+1:m+2] != 'P'):
                        Peptide_MC1 += PrEST_seq[m:m+1]
                        if len(Peptide_MC1) >= 6:
                            for line in range(len(in_list1)):
                                if re.search(r"\b(?=\w)%s\b(?!\w)" % Peptide_MC1, in_list1[line][2]) is not None:
                                    Unique = 'N'
                                    Copy_ID_list.append(in_list[row][0])
                                    Copy_list.append(Peptide_MC1)
                                    break
                            if Unique == 'Y':
                                ID_list.append(in_list[row][0])
                                Peptide_list.append(Peptide_MC1)
                        Unique = 'Y'
                        MC1 = 'Y'
                    else:
                        Peptide_MC1 += PrEST_seq[m:m+1]
                # ------------- End MC1 while-loop ------------
                # -------- Two missed cleavages while-loop --------
                Peptide_MC2 = Peptide_MC1
                k = m
                while MC2 == 'N' and k+1 <= len(PrEST_seq):
                    k += 1
                    if ((PrEST_seq[k:k+1] == 'R' or
                         PrEST_seq[k:k+1] == 'K') and
                            PrEST_seq[k+1:k+2] != 'P'):
                        Peptide_MC2 += PrEST_seq[k:k+1]
                        if len(Peptide_MC2) >= 6:
                            for line in range(len(in_list1)):
                                if re.search(r"\b(?=\w)%s\b(?!\w)" % Peptide_MC2, in_list1[line][2]) is not None:
                                    Unique = 'N'
                                    Copy_ID_list.append(in_list[row][0])
                                    Copy_list.append(Peptide_MC2)
                                    break
                            if Unique == 'Y':
                                ID_list.append(in_list[row][0])
                                Peptide_list.append(Peptide_MC2)
                        Unique = 'Y'
                        MC2 = 'Y'
                    else:
                        Peptide_MC2 += PrEST_seq[k:k+1]
                # ------------ End MC2 while-loop -------------
                # Resets variables
                Peptide = ''
                Peptide_MC1 = ''
                Peptide_MC2 = ''
                MC1 = 'N'
                MC2 = 'N'
            elif First == 'Y': # Doesn't count first cleavage (contains ABP)
                Peptide = ''
                First = 'N'
        else: # Non-cleavable AAs - Peptide grows
            Peptide += PrEST_seq[n:n+1]
    # ------- End main AA-reader for-loop --------
    Peptide_list.sort(key=len, reverse=True) # Sorts list by length
    for j in range(len(Peptide_list)): # Writes current PrEST to file
        Collected_list = (ID_list[j],Peptide_list[j])
        writer.writerow(Collected_list)
    Peptide_list = []
    ID_list = []
    Copy_list.sort(key=len, reverse=True)
    for j in range(len(Copy_list)):
        Collected_list = (Copy_ID_list[j],Copy_list[j])
        writer1.writerow(Collected_list)
    Copy_list = []
    Copy_ID_list = []
# ----- End main PrEST-reader for-loop -----
print('------- Finished -------')
print('Total time',time.perf_counter()-t0,'seconds')
out_file.close()
out_file1.close()
Overall, I am quite new to Python and to programming in general, and I am pretty sure my code is lacking in many ways. If I leave out the search against the big .csv file it runs very fast, but I kind of need that part. I do not know whether the search itself can be made faster, or whether other parts can be sped up as well.
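If it helps to narrow down where the time actually goes, is the standard cProfile module the right tool for that? I assume something like this would show it (the filename is just a placeholder for wherever I save the script above):

import cProfile
import pstats

# 'tryptic_digest.py' is a placeholder for my script's filename
cProfile.run(open('tryptic_digest.py').read(), 'digest_profile.out')
pstats.Stats('digest_profile.out').sort_stats('cumulative').print_stats(15)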