How To Randomly Extract Fasta Sequences Using Python?

February 28, 2024 Post a Comment

I have the following sequences which is in a fasta format with sequence header and its nucleotides. How can I randomly extract the sequences. For example I would like to randomly s

Solution 1:

If you are working with fasta files use BioPython, to get n sequences use random.sample:

from Bio import SeqIO
from random import sample
withopen("foo.fasta") as f:
    seqs = SeqIO.parse(f,"fasta")
    print(sample(list(seqs), 2))

Output:

[SeqRecord(seq=Seq('GAGATCGTCCGGGACCTGGGT', SingleLetterAlphabet()), id='chr1:1154147-1154167', name='chr1:1154147-1154167', description='chr1:1154147-1154167', dbxrefs=[]), SeqRecord(seq=Seq('GTCCGCTTGCGGGACCTGGGG', SingleLetterAlphabet()), id='chr1:983001-983021', name='chr1:983001-983021', description='chr1:983001-983021', dbxrefs=[])]

You can extract the strings if necessary:

print([(seq.name,str(seq.seq)) for seq in  sample(list(seqs),2)])
 [('chr1:1310706-1310726', 'GACGGTTTCCGGTTAGTGGAA'), ('chr1:983001-983021', 'GTCCGCTTGCGGGACCTGGGG')]

If the lines were always in pairs and you skipped the metadata at the top you could zip:

from random import sample

withopen("foo.fasta") as f:
    print(sample(list(zip(f, f)), 2))

Which will give you pairs of lines in tuples:

[('>chr1:983001-983021\n', 'GTCCGCTTGCGGGACCTGGGG\n'), ('>chr1:984333-984353\n', 'CTGGAATTCCGGGCGCTGGAG\n')]

To get the lines ready to be written:

from Bio import SeqIO
from random import sample
withopen("foo.fasta") as f:
    seqs = SeqIO.parse(f, "fasta")
    samps = ((seq.name, seq.seq) for seq in  sample(list(seqs),2))
    for samp in samps:
        print(">{}\n{}".format(*samp))

Output:

>chr1:1310706-1310726
GACGGTTTCCGGTTAGTGGAA
>chr1:983001-983021
GTCCGCTTGCGGGACCTGGGG

Solution 2:

Given the file format that you have shown, and assuming that the file is not too large, you don't need any external module (e.g. biopython) to do this:

import random

withopen('A.fasta') as f:
    data = f.read().splitlines()
    for i in random.sample(range(0, len(data), 2), 2):
        print data[i]
        print data[i+1]

Example output:

>chr1:984333-984353
CTGGAATTCCGGGCGCTGGAG
>chr1:901959-901979
GAGGGCTTTCTGGAGAAGGAG

This simply selects 2 random sequence headers (those lines from A.fasta with even indices in data) and the line following it.

If your file is large then external modules might have optimisations to cope with larger data sets.

Solution 3:

Don't know much about Fasta, but Python has a Fasta module (you need to install it first).

>>> from pyfasta import Fasta

>>> f = Fasta('tests/test1.fasta')
>>> sorted(f.keys())
['chr1', 'chr2', 'chr3']

Then you can use the sample function from Python's Random module and pick as many as you want at random...

from random import sample
sample(f, how_many_you_want)

Solution 4:

import sys,random
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_protein

# Use: python   scriptname.py   number_of_random_seq   infile.fasta   outfile.fasta

infile = sys.argv[2]                                #Name of the input file
seq = list(SeqIO.parse(infile,"fasta"))             #Create a list with all the sequence recordsprint"Input fasta file = ", infile

totseq = len(seq)                                   #Total number of sequences in the input fileprint"Number of sequences in the original file = ", totseq

randseq = int(sys.argv[1])                          #Number of random sequences desiredprint"Number of random sequences desired = ", randseq

if randseq > totseq:
  print"The requested number of random sequences is greater that the total number of input sequences. Exiting."
  exit()

outfile = sys.argv[3]                               #Name of the output fileprint"Output fasta file = ", outfile

outrandseq = []
outlist = []
print"Randomly chosen output sequences:"for i inrange(randseq):
  choose = random.randint(1,totseq-1)               #Choose a random sequence record numberfor j inrange(len(outrandseq)):                  #Test to see if the random sequence record number has already been chosenif choose == outrandseq[j]:
      choose = random.randint(1,totseq-1)           #Choose a new random sequence record number if the current one has already been chosen
  outrandseq.append(choose)
  print choose
  outseq = seq[choose]
  outlist.append(outseq)                            #Append seq record to output list

SeqIO.write(outlist, outfile, "fasta")              #Write the output list to the outfile

exit()

Solution 5:

import sys,random
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_protein

# I use this from small numbers of sequences (input file up to 10000 sequences) and it works fine. # For very large sequence sets it may be too slow -- I just have not tried.# Use: python   scriptname.py   number_of_random_seq   infile.fasta   outfile.fasta

infile = sys.argv[2]                                #Name of the input file
seq = list(SeqIO.parse(infile,"fasta"))             #Create a list with all the sequence recordsprint"Input fasta file = ", infile

totseq = len(seq)                                   #Total number of sequences in the input fileprint"Number of sequences in the original file = ", totseq

numrandseq = int(sys.argv[1])                       #Number of random sequences desiredprint"Number of random sequences desired = ", numrandseq

if numrandseq > totseq:
  print"The requested number of random sequences is greater that the total number of input sequences. Exiting."
  exit()

outfile = sys.argv[3]                               #Name of the output fileprint"Output fasta file = ", outfile

outrandseqset = []
i = 1for i inrange(numrandseq):                         #Create a list of random sequence record numbers for output
  choice = random.randint(1,totseq)
  outrandseqset.append(choice)

i = 1
j = 1
duplicate = 1while duplicate:                                    #Make sure no sequences are duplicated in the list
    duplicate = 0for i inrange(numrandseq):
      for j inrange(i+1, numrandseq):
        if outrandseqset[i] == outrandseqset[j]:
            outrandseqset[j] = random.randint(1,totseq)
            duplicate = 1


i = 1print"Randomly chosen output sequences:"for i inrange(numrandseq):
  print outrandseqset[i]

outlist = []
i = 1for i inrange(numrandseq):                         #Create the list of seq records to be written to the output file
  seqnum = outrandseqset[i]
  outseq = seq[seqnum]
  outlist.append(outseq)

SeqIO.write(outlist, outfile, "fasta")              #Write the output list to the outfile

exit()

Python Channel