Just split your string on 'N'
and then remove all the strings that are empty, or just contain a newline. Like this:
#!/usr/bin/env python
DNAstring = '''AAACAACAGGGTACAAAGAGTCACGCTTATCCTGTTGATACT
TCTCAATGGGCAGTACATATCATCTCTNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNAAAACGTGTGCATGAACAAAAAA
CGTAGCAGATCGTGACTGGCTATTGTATTGTGTCAATTTCGCTTCGTCAC
TAAATCAACGGACATGTGTTGC'''
sequences = [u for u in DNAstring.split('N') if u and u != '\n']
for i, seq in enumerate(sequences):
print i
print seq.replace('\n', '') + '\n'
output
0
AAACAACAGGGTACAAAGAGTCACGCTTATCCTGTTGATACTTCTCAATGGGCAGTACATATCATCTCT
1
AAAACGTGTGCATGAACAAAAAACGTAGCAGATCGTGACTGGCTATTGTATTGTGTCAATTTCGCTTCGTCACTAAATCAACGGACATGTGTTGC
The code snippet above also removes newlines inside the sequences using .replace('\n', '')
.
Here are a few programs that you may find useful.
Firstly, a line buffer class. You initialise it with a file name and a line width. You can then feed it random length strings and it will automatically save them to the text file, line by line, with all lines (except possibly the last line) having the given length. You can use this class in other programs to make your output look neat.
Save this file as linebuffer.py
to somewhere in your Python path; the simplest way is to save it wherever you save your Python programs and make that the current directory when you run the programs.
linebuffer.py
#! /usr/bin/env python
''' Text output buffer
Write fixed width lines to a text file
Written by PM 2Ring 2015.03.23
'''
class LineBuffer(object):
''' Text output buffer
Write fixed width lines to file fname
'''
def __init__(self, fname, width):
self.fh = open(fname, 'wt')
self.width = width
self.buff = []
self.bufflen = 0
def write(self, data):
''' Write a string to the buffer '''
self.buff.append(data)
self.bufflen += len(data)
if self.bufflen >= self.width:
self._save()
def _save(self):
''' Write the buffer to the file '''
buff = ''.join(self.buff)
#Split buff into lines
lines = []
while len(buff) >= self.width:
lines.append(buff[:self.width])
buff = buff[self.width:]
#Add an empty line so we get a trailing newline
lines.append('')
self.fh.write('\n'.join(lines))
self.buff = [buff]
self.bufflen = len(buff)
def close(self):
''' Flush the buffer & close the file '''
if self.bufflen > 0:
self.fh.write(''.join(self.buff) + '\n')
self.fh.close()
def testLB():
alpha = 'abcdefghijklmnopqrstuvwxyz'
fname = 'linebuffer_test.txt'
lb = LineBuffer(fname, 27)
for _ in xrange(30):
lb.write(alpha)
lb.write(' bye.')
lb.close()
if __name__ == '__main__':
testLB()
Here is a program that makes random DNA sequences of the form you described in your question. It uses linebuffer.py
to handle the output. I wrote this so I could test my DNA sequence splitter properly.
Random_DNA0.py
#! /usr/bin/env python
''' Make random DNA sequences
Sequences consist of random subsequences of the letters 'ACGT'
as well as short sequences of 'N', of random length up to 200.
Exactly 1000 'N's separate sequence blocks.
All sequences may contain newlines chars
Takes approx 3 seconds per megabyte generated and saved
on a 2GHz CPU single core machine.
Written by PM 2Ring 2015.03.23
'''
import sys
import random
from linebuffer import LineBuffer
#Set seed to None to seed randomizer from system time
random.seed(37)
#Output line width
linewidth = 120
#Subsequence base length ranges
minsub, maxsub = 15, 300
#Subsequences per sequence ranges
minseq, maxseq = 5, 50
#random 'N' sequence ranges
minn, maxn = 5, 200
#Probability that a random 'N' sequence occurs after a subsequence
randn = 0.2
#Sequence separator
nsepblock = 'N' * 1000
def main():
#Get number of sequences from the command line
numsequences = int(sys.argv[1]) if len(sys.argv) > 1 else 2
outname = 'DNA_sequence.txt'
lb = LineBuffer(outname, linewidth)
for i in xrange(numsequences):
#Write the 1000*'N' separator between sequences
if i > 0:
lb.write(nsepblock)
for j in xrange(random.randint(minseq, maxseq)):
#Possibly make a short run of 'N's in the sequence
if j > 0 and random.random() < randn:
lb.write(''.join('N' * random.randint(minn, maxn)))
#Create a single subsequence
r = xrange(random.randint(minsub, maxsub))
lb.write(''.join([random.choice('ACGT') for _ in r]))
lb.close()
if __name__ == '__main__':
main()
Finally, we have a program that splits your random DNA sequences. Once again, it uses linebuffer.py
to handle the output.
DNA_Splitter0.py
#! /usr/bin/env python
''' Split DNA sequences and save to separate files
Sequences consist of random subsequences of the letters 'ACGT'
as well as short sequences of 'N', of random length up to 200.
Exactly 1000 'N's separate sequence blocks.
All sequences may contain newlines chars
Written by PM 2Ring 2015.03.23
'''
import sys
from linebuffer import LineBuffer
#Output line width
linewidth = 120
#Sequence separator
nsepblock = 'N' * 1000
def main():
iname = 'DNA_sequence.txt'
outbase = 'contig'
with open(iname, 'rt') as f:
data = f.read()
#Remove all newlines
data = data.replace('\n', '')
sequences = data.split(nsepblock)
#Save each sequence to a series of files
for i, seq in enumerate(sequences, 1):
outname = '%s%05d' % (outbase, i)
print outname
#Write sequence data, with line breaks
lb = LineBuffer(outname, linewidth)
lb.write(seq)
lb.close()
if __name__ == '__main__':
main()