/
validate-data.py
169 lines (139 loc) · 5.92 KB
/
validate-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import re
import os
import sys
import getopt
from ete2 import Tree
from Bio import SeqIO
from Bio import Entrez
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from StringIO import StringIO as SIO
from termcolor import colored
def check_data(gene, taxa_names, email, seqs_per_taxon, corrections, max_length) :
## Check that NCBI has data for the target gene for all leafs in tree
print "Checking that NCBI has records for all target taxa..."
ncbi_data = {}
taxon_data = {}
no_data = False
count = 1
est_seq_count = {}
for t in taxa_names :
seqs_present = True
result, est_seqs = check_seqs(t, gene, email, seqs_per_taxon, ott_ids[t], corrections, max_length)
taxon_result = check_taxon(t, email, corrections)
est_seq_count[t] = est_seqs
if result == 0 :
seqs_present = False
no_data = True
ncbi_data[t] = seqs_present
if taxon_result == 0 :
taxon_data[t] = False
if no_data == False :
print "All terminal taxa appear to have data available in Genbank for gene %s." %gene
elif no_data == True :
print "Some terminal taxa don't have data available in Genbank for the gene %s." %gene
print "Please check the gene name for errors or double check that data is available in Genbank."
print "-"*100
table_fmt = [5, 13, 12, 9]
long_key = 0
for key in ncbi_data.keys() :
if len(key) > long_key :
long_key = len(key)
print "Taxon%s\t Taxon Present \t Data Present \t # of Seqs" %(" "*(long_key-len("Taxon")))
print "%s \t %s \t %s \t %s" %("-"*13+" "*(long_key-13),"-"*13,"-"*13,"-"*13)
for key, val in ncbi_data.items() :
chars = len(key)
fmt_spaces = " " * (long_key - chars)
if val == False and ncbi_data[key] == False :
print "%s%s \t %s%s \t %s%s \t %s" %(key, fmt_spaces, colored(ncbi_data[key], 'red'), " "*8, colored(val, 'red'), " "*13, est_seq_count[key])
elif ncbi_data[key] == False :
print "%s%s \t %s%s \t %s%s \t %s" %(key, fmt_spaces, colored(ncbi_data[key], 'red'), " "*8, val, " "*13, est_seq_count[key])
elif val == False :
print "%s%s \t %s%s \t %s%s \t %s" %(key, fmt_spaces, ncbi_data[key], " "*8, colored(val, 'red'), " "*13, est_seq_count[key])
else :
print "%s%s \t %s%s \t %s%s \t %s" %(key, fmt_spaces, colored(ncbi_data[key], 'green'), " "*8, colored(val, 'green'), " "*13, est_seq_count[key])
print "-"*100
def check_seqs(t, gene, email, seqs_per_taxon, ott_id, corrections, max_length) :
seq_present = 0
Entrez.email = email
## get list of gi values that match organism and gene
if t in corrections.keys() :
if corrections[t].split(",")[1] == "organism" :
term = corrections[t].split(",")[0] + "[Organism] AND " + gene + "[Gene]"
elif corrections[t].split(",")[1] == "accession" :
term = corrections[t].split(",")[0] + "[Accession]"
else :
term = t + "[Organism] AND " + gene + "[Gene]"
handle = Entrez.esearch(db="nucleotide", term=term)
record = Entrez.read(handle)
if len(record['IdList']) > 0 :
## get seq length here
#for each id, get seq length here if there is at least one present < 2000, set value to 1.
for gi in record['IdList'] :
if seq_present == 0 :
handle = Entrez.efetch(db="nucleotide", id=gi ,rettype="gb", retmode="text")
gbrecord = SeqIO.read(handle, "genbank")
if len(gbrecord.seq) <= max_length :
seq_present = 1
else :
seq_present = 0
est_seqs = len(record['IdList'])
return seq_present, est_seqs
def check_taxon(t, email, corrections) :
seq_present = 0
Entrez.email = email
## get list of gi values that match organism and gene
if t in corrections.keys() :
if corrections[t].split(",")[1] == "organism" :
term = corrections[t].split(",")[0] + "[Organism]"
elif corrections[t].split(",")[1] == "accession" :
term = corrections[t].split(",")[0] + "[Accession]"
else :
term = t + "[Organism]"
handle = Entrez.esearch(db="taxonomy", term=term)
record = Entrez.read(handle)
if len(record['IdList']) > 0 :
seq_present = 1
else :
seq_present = 0
return seq_present
## parameters
gene = "ND2"
email = "lcoghill@fieldmuseum.org"
seqs_per_taxon = 5
tree_file_name = 'examples/centarchidae.tre'
corrections_file_name = 'examples/corrections.csv'
max_length = 2000
## get leaf names from newick
print "Looking for newick tree file...",
if os.path.isfile(tree_file_name) :
tree_handle = open(tree_file_name, 'r')
print "Found. \nParsing newick file %s..." %tree_file_name,
for line in tree_handle :
newick = line.strip('\n')+";"
print "Done."
else :
print "\nNo tree file found. Cannot continue."
sys.exit()
ott_ids = {}
taxa_names = []
tree = Tree(newick, format=9)
for leaf in tree :
name = re.sub(r'_ott.*?$', '', leaf.name)
name = name.replace("_", " ")
ott_id = leaf.name.split("_ott")[1]
ott_ids[name] = ott_id
taxa_names.append(name.replace("'", ""))
## check for, and process a corrections .csv file here if present
corrections = {}
print "Checking for a corrections file...",
if os.path.isfile(corrections_file_name) :
corrections_file = open(corrections_file_name, 'r')
print "Found. \nLoading corrections corrections file...",
for line in corrections_file :
rec = line.strip("\n").split(",")
corrections[rec[0]] = rec[1] + "," + rec[2].replace(" ", "").strip()
print "Done."
else :
print "\nNo corrections file found. Proceeding without it."
check_data(gene, taxa_names, email, seqs_per_taxon, corrections, max_length)