How To Randomly Extract Fasta Sequences Using Python?
Solution 1:
If you are working with fasta files use BioPython, to get n sequences use random.sample:
from Bio import SeqIO
from random import sample
withopen("foo.fasta") as f:
seqs = SeqIO.parse(f,"fasta")
print(sample(list(seqs), 2))
Output:
[SeqRecord(seq=Seq('GAGATCGTCCGGGACCTGGGT', SingleLetterAlphabet()), id='chr1:1154147-1154167', name='chr1:1154147-1154167', description='chr1:1154147-1154167', dbxrefs=[]), SeqRecord(seq=Seq('GTCCGCTTGCGGGACCTGGGG', SingleLetterAlphabet()), id='chr1:983001-983021', name='chr1:983001-983021', description='chr1:983001-983021', dbxrefs=[])]
You can extract the strings if necessary:
print([(seq.name,str(seq.seq)) for seq in sample(list(seqs),2)])
[('chr1:1310706-1310726', 'GACGGTTTCCGGTTAGTGGAA'), ('chr1:983001-983021', 'GTCCGCTTGCGGGACCTGGGG')]If the lines were always in pairs and you skipped the metadata at the top you could zip:
from random import sample
withopen("foo.fasta") as f:
print(sample(list(zip(f, f)), 2))
Which will give you pairs of lines in tuples:
[('>chr1:983001-983021\n', 'GTCCGCTTGCGGGACCTGGGG\n'), ('>chr1:984333-984353\n', 'CTGGAATTCCGGGCGCTGGAG\n')]To get the lines ready to be written:
from Bio import SeqIO
from random import sample
withopen("foo.fasta") as f:
seqs = SeqIO.parse(f, "fasta")
samps = ((seq.name, seq.seq) for seq in sample(list(seqs),2))
for samp in samps:
print(">{}\n{}".format(*samp))
Output:
>chr1:1310706-1310726
GACGGTTTCCGGTTAGTGGAA
>chr1:983001-983021
GTCCGCTTGCGGGACCTGGGG
Solution 2:
Given the file format that you have shown, and assuming that the file is not too large, you don't need any external module (e.g. biopython) to do this:
import random
withopen('A.fasta') as f:
data = f.read().splitlines()
for i in random.sample(range(0, len(data), 2), 2):
print data[i]
print data[i+1]
Example output:
>chr1:984333-984353 CTGGAATTCCGGGCGCTGGAG >chr1:901959-901979 GAGGGCTTTCTGGAGAAGGAG
This simply selects 2 random sequence headers (those lines from A.fasta with even indices in data) and the line following it.
If your file is large then external modules might have optimisations to cope with larger data sets.
Solution 3:
Don't know much about Fasta, but Python has a Fasta module (you need to install it first).
>>> from pyfasta import Fasta
>>> f = Fasta('tests/test1.fasta')
>>> sorted(f.keys())
['chr1', 'chr2', 'chr3']
Then you can use the sample function from Python's Random module and pick as many as you want at random...
from random import sample
sample(f, how_many_you_want)Solution 4:
import sys,random
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_protein
# Use: python scriptname.py number_of_random_seq infile.fasta outfile.fasta
infile = sys.argv[2] #Name of the input file
seq = list(SeqIO.parse(infile,"fasta")) #Create a list with all the sequence recordsprint"Input fasta file = ", infile
totseq = len(seq) #Total number of sequences in the input fileprint"Number of sequences in the original file = ", totseq
randseq = int(sys.argv[1]) #Number of random sequences desiredprint"Number of random sequences desired = ", randseq
if randseq > totseq:
print"The requested number of random sequences is greater that the total number of input sequences. Exiting."
exit()
outfile = sys.argv[3] #Name of the output fileprint"Output fasta file = ", outfile
outrandseq = []
outlist = []
print"Randomly chosen output sequences:"for i inrange(randseq):
choose = random.randint(1,totseq-1) #Choose a random sequence record numberfor j inrange(len(outrandseq)): #Test to see if the random sequence record number has already been chosenif choose == outrandseq[j]:
choose = random.randint(1,totseq-1) #Choose a new random sequence record number if the current one has already been chosen
outrandseq.append(choose)
print choose
outseq = seq[choose]
outlist.append(outseq) #Append seq record to output list
SeqIO.write(outlist, outfile, "fasta") #Write the output list to the outfile
exit()
Solution 5:
import sys,random
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_protein
# I use this from small numbers of sequences (input file up to 10000 sequences) and it works fine. # For very large sequence sets it may be too slow -- I just have not tried.# Use: python scriptname.py number_of_random_seq infile.fasta outfile.fasta
infile = sys.argv[2] #Name of the input file
seq = list(SeqIO.parse(infile,"fasta")) #Create a list with all the sequence recordsprint"Input fasta file = ", infile
totseq = len(seq) #Total number of sequences in the input fileprint"Number of sequences in the original file = ", totseq
numrandseq = int(sys.argv[1]) #Number of random sequences desiredprint"Number of random sequences desired = ", numrandseq
if numrandseq > totseq:
print"The requested number of random sequences is greater that the total number of input sequences. Exiting."
exit()
outfile = sys.argv[3] #Name of the output fileprint"Output fasta file = ", outfile
outrandseqset = []
i = 1for i inrange(numrandseq): #Create a list of random sequence record numbers for output
choice = random.randint(1,totseq)
outrandseqset.append(choice)
i = 1
j = 1
duplicate = 1while duplicate: #Make sure no sequences are duplicated in the list
duplicate = 0for i inrange(numrandseq):
for j inrange(i+1, numrandseq):
if outrandseqset[i] == outrandseqset[j]:
outrandseqset[j] = random.randint(1,totseq)
duplicate = 1
i = 1print"Randomly chosen output sequences:"for i inrange(numrandseq):
print outrandseqset[i]
outlist = []
i = 1for i inrange(numrandseq): #Create the list of seq records to be written to the output file
seqnum = outrandseqset[i]
outseq = seq[seqnum]
outlist.append(outseq)
SeqIO.write(outlist, outfile, "fasta") #Write the output list to the outfile
exit()
Post a Comment for "How To Randomly Extract Fasta Sequences Using Python?"