Skip to content

Commit

Permalink
Add stop codon to index
Browse files Browse the repository at this point in the history
  • Loading branch information
saketkc committed May 2, 2023
1 parent 7d11a8a commit 855d66e
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 10 deletions.
16 changes: 13 additions & 3 deletions ribotricer/orf.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def __init__(
strand,
intervals,
seq="",
stop_codon="",
leader="",
trailer="",
):
Expand Down Expand Up @@ -63,6 +64,13 @@ def start_codon(self):
if len(self.seq) < 3:
return None
return self.seq[:3]

@property
def stop_codon(self):
"""Return the first 3 bases from sequence"""
if len(self.seq) < 3:
return None
return self.seq[-3:]

@classmethod
def from_string(cls, line):
Expand All @@ -81,7 +89,7 @@ def from_string(cls, line):
print("annotation line cannot be empty")
return None
fields = line.split("\t")
if len(fields) != 11:
if len(fields) != 12:
sys.exit(
"{}\n{}".format(
"Error: unexpected number of columns found for index file",
Expand All @@ -99,7 +107,8 @@ def from_string(cls, line):
chrom = fields[7]
strand = fields[8]
start_codon = fields[9]
coordinate = fields[10]
stop_codon = fields[10]
coordinate = fields[11]
intervals = []
for group in coordinate.split(","):
start, end = group.split("-")
Expand All @@ -117,6 +126,7 @@ def from_string(cls, line):
strand,
intervals,
seq=start_codon,
stop_codon=stop_codon
)

@classmethod
Expand All @@ -127,7 +137,7 @@ def from_tracks(cls, tracks, category, seq="", leader="", trailer=""):
tracks: list of GTFTrack
This method uses a fail-fast stragy and hence multiple
returns. It ultimately retulrs an object correponding to the
returns. It ultimately returns an object correponding to the
parsed line.
"""
if not tracks:
Expand Down
22 changes: 15 additions & 7 deletions ribotricer/prepare_orfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ def search_orfs(fasta, intervals, min_orf_length, start_codons, stop_codons, lon
for m in re.finditer(alternative_start_regx, merged_seq)
]
stop_regx = re.compile(r"(?=({}))".format("|".join(stop_codons)))
#stop_regx = re.compile(r"{}".format("|".join(stop_codons)))
start_stop_idx += [(m.start(0), "stop") for m in re.finditer(stop_regx, merged_seq)]
start_stop_idx.sort(key=lambda x: x[0])
for frame in [0, 1, 2]:
Expand All @@ -199,15 +200,20 @@ def search_orfs(fasta, intervals, min_orf_length, start_codons, stop_codons, lon
starts.append(idx)
elif starts:
for start in starts:
if idx - start >= min_orf_length:
# if ORF is not proper length, continue
if (idx - start) % 3 !=0:
print(idx-start)
continue
if (idx - start >= min_orf_length) and (idx - start) % 3 == 0:
ivs = transcript_to_genome_iv(
start, idx - 1, intervals, reverse
)
seq = merged_seq[start:idx]
seq = merged_seq[start:(idx+3)]
stop_codon = merged_seq[idx:(idx+3)]
leader = merged_seq[:start]
trailer = merged_seq[idx + 3 :]
if ivs:
orfs.append((ivs, seq, leader, trailer))
orfs.append((ivs, seq, stop_codon, leader, trailer))
if longest:
break
starts = []
Expand Down Expand Up @@ -299,7 +305,7 @@ def prepare_orfs(
for tid in gtf.cds[gid]:
tracks = gtf.cds[gid][tid]
seq = fetch_seq(fasta, tracks)
orf = ORF.from_tracks(tracks, "annotated", seq=seq[:3])
orf = ORF.from_tracks(tracks, "annotated", seq=seq)
if orf:
cds_orfs[gid][tid] = orf
candidate_orfs.append(orf)
Expand All @@ -323,7 +329,7 @@ def prepare_orfs(
orfs = search_orfs(
fasta, ivs, min_orf_length, start_codons, stop_codons, longest
)
for ivs, seq, leader, trailer in orfs:
for ivs, seq, stop_codon, leader, trailer in orfs:
orf = ORF(
"unknown",
tid,
Expand All @@ -334,7 +340,7 @@ def prepare_orfs(
chrom,
strand,
ivs,
seq=seq[:3],
seq=seq
)
orf.category = check_orf_type(orf, cds_orfs)
if orf.category != "annotated" and orf.category != "internal":
Expand All @@ -354,6 +360,7 @@ def prepare_orfs(
"chrom",
"strand",
"start_codon",
"stop_codon",
"coordinate\n",
]

Expand All @@ -375,7 +382,8 @@ def prepare_orfs(
orf.chrom,
orf.strand,
orf.start_codon,
coordinate,
orf.stop_codon,
coordinate
)

with open("{}_candidate_orfs.tsv".format(prefix), "w") as output:
Expand Down

0 comments on commit 855d66e

Please sign in to comment.