/
blot.py
147 lines (119 loc) · 5.35 KB
/
blot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import re
import os
import sys
import getopt
import subprocess
from ete2 import Tree
from Bio import SeqIO
from Bio import Entrez
from Bio.Seq import Seq
from Bio import AlignIO
from Bio.Alphabet import IUPAC
from Bio.Align import AlignInfo
from Bio.Phylo.Consensus import *
from Bio.SeqRecord import SeqRecord
from StringIO import StringIO as SIO
def get_seqs(t, gene, email, seqs_per_taxon, corrections, max_length, names_and_numbers) :
Entrez.email = email
## get list of gi values that match organism and gene
if t in corrections.keys() :
if corrections[t].split(",")[1] == "organism" :
term = corrections[t].split(",")[0] + "[Organism] AND " + gene + "[Gene]"
elif corrections[t].split(",")[1] == "accession" :
term = corrections[t].split(",")[0] + "[Accession]"
else :
term = t + "[Organism] AND " + gene + "[Gene]"
handle = Entrez.esearch(db="nucleotide", term=term)
record = Entrez.read(handle)
if len(record['IdList']) < seqs_per_taxon :
print "Warning: there are fewer than %s %s sequences available for %s. Using all %s available." %(seqs_per_taxon, gene, t, len(record['IdList']))
## get sequence data
seq_recs = []
for gi in record['IdList'] :
handle = Entrez.efetch(db="nucleotide", id=gi ,rettype="gb", retmode="text")
record = SeqIO.read(handle, "genbank")
if len(seq_recs) <= seqs_per_taxon and len(record.seq) <= max_length :
seq_recs.append(record)
# TODO: filter sequences for quality in some way
# build file object to pass to muscle for alignment
output = open("fasta/"+t.replace(" ", "_")+"_"+gene+"_seqs.fas", 'w')
SeqIO.write(seq_recs, output, "fasta")
return len(seq_recs)
## parameters
gene = "ND2"
email = "test@example.com"
max_length = 2000
seqs_per_taxon = 3
tree_file_name = 'examples/centarchidae.tre'
corrections_file_name = 'examples/corrections.csv'
max_muscle_iters = "3"
max_muscle_trees = "2"
## get leaf names from newick
os.mkdir("fasta")
tree_handle = open(tree_file_name, 'r')
print "Parsing newick file %s..." %tree_file_name
for line in tree_handle :
newick = line.strip('\n')+";"
ott_ids = {}
names_and_numbers = {}
taxa_names = []
tree = Tree(newick, format=8)
for leaf in tree :
name = re.sub(r'_ott.*?$', '', leaf.name)
name = name.replace("_", " ")
ott_id = leaf.name.split("_ott")[1]
ott_ids[name] = ott_id
taxa_names.append(name.replace("'", ""))
tree.write(format=9, outfile="examples/constraint_tree.tre")
## check for, and process a corrections .csv file here if present
corrections_file = open(corrections_file_name, 'r')
corrections = {}
print "Loading corrections corrections file..."
for line in corrections_file :
rec = line.strip("\n").split(",")
corrections[rec[0]] = rec[1] + "," + rec[2].replace(" ", "").strip()
## get sequence data for X number of records for gene X
## and build consensus sequence of those sequences for each taxa
leaf_con_seqs = {}
con_seqs = []
for t in taxa_names :
# get sequences from genbank for all taxa, build fasta style file object and return
num_res = get_seqs(t, gene, email, seqs_per_taxon, corrections, max_length, names_and_numbers)
## build alignment for those sequences
if num_res > 1 :
seqs_f = "fasta/"+t.replace(" ", "_") + "_" + gene + "_seqs.fas"
align_f = "fasta/"+t.replace(" ", "_") + "_" + gene + "_alignment.fas"
subprocess.call(["muscle", "-in", seqs_f, "-out", align_f, "-maxiters", max_muscle_iters, "-quiet"])
msa = AlignIO.read(align_f, "fasta")
msa_summary = AlignInfo.SummaryInfo(msa)
consensus = msa_summary.dumb_consensus()
# build seqrec object here and add to con_seqs
record = SeqRecord(Seq(str(consensus), IUPAC.ambiguous_dna), id=t.replace(" ", "_")+"_ott"+ott_ids[t], description="")
con_seqs.append(record)
## get all the singles
else :
fasta_f = "fasta/"+t.replace(" ", "_") + "_" + gene + "_seqs.fas"
handle = open(fasta_f, "rU")
for record in SeqIO.parse(handle, "fasta") :
record.id = t.replace(" ", "_")+"_ott"+ott_ids[t]
record.description = ""
con_seqs.append(record)
## write all species seqs to fasta
out_fasta_file = tree_file_name.split(".")[0] + "_" + gene + "_sequences.fas"
out_fasta = open(out_fasta_file, 'w')
print "Writing %s raw sequences fasta file to %s..." %(len(con_seqs), out_fasta_file)
SeqIO.write(con_seqs, out_fasta, "fasta")
out_fasta.close()
## build alignment of new fasta
print "Building alignment of %s..." %out_fasta_file
final_msa_file = tree_file_name.split("/")[1].replace(".tre", "") + "_" + gene + "_alignment.fas"
subprocess.call(["muscle", "-in", out_fasta_file, "-diags", "-maxiters", max_muscle_iters,
"-maxtrees", max_muscle_trees, "-out", final_msa_file])
## convert to relaxed phylip here
temp_alignment = AlignIO.read(final_msa_file, "fasta")
AlignIO.write(temp_alignment, final_msa_file.replace(".fas", ".phy"), "phylip-relaxed")
## build a ML tree with new alignment
print "Building final tree with constrained topology to estimate branch lengths..."
subprocess.call(["phyml", "-i", final_msa_file.replace(".fas", ".phy"), "-u", "examples/constraint_tree.tre", "-d" "nt", "-m", "GTR", "-o", "l"])
## write code to replace short names with original names in tree and alignment files
print "Complete."