Merge pull request #158 from smithlabcode/fixing-linter-errors
Fixing linter errors
saketkc committed Apr 13, 2024
2 parents 23d6188 + e269b62 commit 071b5cc
Showing 16 changed files with 115 additions and 47 deletions.
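
The changes below are largely mechanical lint fixes: bare `except:` clauses become `except Exception:` (flake8 E722), long statements are wrapped, and missing whitespace around `=` is added (E225). As a minimal hedged sketch of the bare-except pattern, not code taken from the repository:

# Hypothetical snippet (not from ribotricer) illustrating the E722 fix:
# a bare "except:" also swallows SystemExit and KeyboardInterrupt, so the
# linter asks for an explicit exception class.
import sys

def parse_read_lengths(raw):
    try:
        return [int(x.strip()) for x in raw.split(",")]
    except Exception:  # previously a bare "except:"
        sys.exit("Error: cannot convert read_lengths into integers")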
2 changes: 1 addition & 1 deletion .github/workflows/pythonpackage.yml
@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.7, 3.8]
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]

steps:
- uses: actions/checkout@v2
6 changes: 5 additions & 1 deletion HISTORY.md
@@ -1,4 +1,8 @@
# V1.3.3 (2022-02-09)
# Unreleased

- Added `meta_min_reads` parameter to control minimum coverage for metagene plots ([#155](https://github.com/smithlabcode/ribotricer/pull/155))

# v1.3.3 (2022-02-09)

- Print start codons when preparing orfs
- Fix for custom start codons
2 changes: 1 addition & 1 deletion ribotricer/__init__.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = "Saket Choudhary, Wenzheng Li"
__version__ = "1.3.3"
__version__ = "1.4.0-dev0"
21 changes: 12 additions & 9 deletions ribotricer/cli.py
@@ -25,6 +25,7 @@
from .const import MINIMUM_VALID_CODONS_RATIO
from .const import MINIMUM_READS_PER_CODON
from .const import MINIMUM_DENSITY_OVER_ORF
from .const import META_MIN_READS

from .count_orfs import count_orfs
from .count_orfs import count_orfs_codon
@@ -49,7 +50,7 @@ def cli():
pass


###################### prepare-orfs function #########################################
# prepare-orfs function #########################################
@cli.command(
"prepare-orfs",
context_settings=CONTEXT_SETTINGS,
@@ -110,7 +111,7 @@ def prepare_orfs_cmd(
prepare_orfs(gtf, fasta, prefix, min_orf_length, start_codons, stop_codons, longest)


###################### detect-orfs function #########################################
# detect-orfs function #########################################
@cli.command(
"detect-orfs",
context_settings=CONTEXT_SETTINGS,
@@ -228,7 +229,7 @@ def detect_orfs_cmd(
if read_lengths is not None:
try:
read_lengths = [int(x.strip()) for x in read_lengths.strip().split(",")]
except:
except Exception:
sys.exit("Error: cannot convert read_lengths into integers")
if not all([x > 0 for x in read_lengths]):
sys.exit("Error: read length must be positive")
@@ -237,8 +238,10 @@
sys.exit("Error: psite_offsets only allowed when read_lengths is provided")
if read_lengths is not None and psite_offsets is not None:
try:
psite_offsets = [int(x.strip()) for x in psite_offsets.strip().split(",")]
except:
psite_offsets = [
int(x.strip()) for x in psite_offsets.strip().split(",")
]
except Exception:
sys.exit("Error: cannot convert psite_offsets into integers")
if len(read_lengths) != len(psite_offsets):
sys.exit("Error: psite_offsets must match read_lengths")
@@ -265,7 +268,7 @@
)


###################### count-orfs function #########################################
# count-orfs function #########################################
@cli.command(
"count-orfs",
context_settings=CONTEXT_SETTINGS,
@@ -307,7 +310,7 @@ def count_orfs_cmd(ribotricer_index, detected_orfs, features, out, report_all):
count_orfs(ribotricer_index, detected_orfs, features, out, report_all)


###################### count-orfs-codon function #########################################
# count-orfs-codon function #########################################
@cli.command(
"count-orfs-codon",
context_settings=CONTEXT_SETTINGS,
@@ -367,7 +370,7 @@ def count_orfs_codon_cmd(
)


###################### orfs-seq function #########################################
# orfs-seq function #########################################
@cli.command(
"orfs-seq",
context_settings=CONTEXT_SETTINGS,
@@ -396,7 +399,7 @@ def orf_seq_cmd(ribotricer_index, fasta, saveto, protein):
orf_seq(ribotricer_index, fasta, saveto, protein)


###################### learn-cutoff function #########################################
# learn-cutoff function #########################################
@cli.command(
"learn-cutoff",
context_settings=CONTEXT_SETTINGS,
11 changes: 8 additions & 3 deletions ribotricer/common.py
@@ -50,7 +50,9 @@ def is_read_uniq_mapping(read):
return False
else:
sys.stdout.write(
"WARNING: ribotricer was unable to detect any tags for determining multimapping status. All the reads will be treated as uniquely mapping\n"
"WARNING: ribotricer was unable to detect any tags for "
"determining multimapping status. All the reads will be "
"treated as uniquely mapping\n"
)


@@ -76,7 +78,9 @@ def merge_intervals(intervals):
intervals[i].end,
intervals[i].strand,
)
while i + 1 < len(intervals) and intervals[i + 1].start <= to_merge.end:
while (
i + 1 < len(intervals) and intervals[i + 1].start <= to_merge.end
):
to_merge.end = max(to_merge.end, intervals[i + 1].end)
i += 1
merged_intervals.append(to_merge)
@@ -124,6 +128,7 @@ def collapse_coverage_to_codon(coverage):
Coverage collapsed to codon level
"""
codon_coverage = [
sum(coverage[current : current + 3]) for current in range(0, len(coverage), 3)
sum(coverage[current: current + 3])
for current in range(0, len(coverage), 3)
]
return codon_coverage
2 changes: 1 addition & 1 deletion ribotricer/const.py
@@ -30,4 +30,4 @@
# defined as the number of reads per unit length of the ORF
MINIMUM_DENSITY_OVER_ORF = 0.0
# Minimum number of reads for a read length to be considered
META_MIN_READS=100000
META_MIN_READS = 100000
44 changes: 36 additions & 8 deletions ribotricer/count_orfs.py
@@ -21,7 +21,9 @@
import pandas as pd


def count_orfs(ribotricer_index, detected_orfs, features, outfile, report_all=False):
def count_orfs(
ribotricer_index, detected_orfs, features, outfile, report_all=False
):
"""
Parameters
----------
@@ -57,7 +59,11 @@ def count_orfs(ribotricer_index, detected_orfs, features, outfile, report_all=Fa
# do not output 'nontranslating' events unless report_all is set
if status != "nontranslating" or report_all:
intervals = orf_index[oid].intervals
coor = [x for iv in intervals for x in range(iv.start, iv.end + 1)]
coor = [
x
for iv in intervals
for x in range(iv.start, iv.end + 1)
]
if strand == "-":
coor = coor[::-1]
profile_stripped = profile.strip()[1:-1].split(", ")
@@ -105,7 +111,9 @@ def count_orfs_codon(
if True, all coverages will be exported
"""
orf_index = {}
fasta_df = pd.read_csv(ribotricer_index_fasta, sep="\t").set_index("ORF_ID")
fasta_df = pd.read_csv(ribotricer_index_fasta, sep="\t").set_index(
"ORF_ID"
)
read_counts = defaultdict(dict)
with open(ribotricer_index, "r") as fin:
# Skip header
@@ -126,9 +134,15 @@
# do not output 'nontranslating' events unless report_all is set
if status != "nontranslating" or report_all:
intervals = orf_index[oid].intervals
coor = [x for iv in intervals for x in range(iv.start, iv.end + 1)]
coor = [
x
for iv in intervals
for x in range(iv.start, iv.end + 1)
]
codon_coor = [
x for iv in intervals for x in range(iv.start, iv.end + 1, 3)
x
for iv in intervals
for x in range(iv.start, iv.end + 1, 3)
]
if strand == "-":
coor = coor[::-1]
@@ -158,7 +172,17 @@
# Output count table
with open("{}_genewise.tsv".format(prefix), "w") as fout:
fout.write(
"gene_id\tcodon\tvalues\tmean_codon_coverage\tmedian_codon_coverage\tvar_codon_coverage\tcodon_occurences\ttotal_codon_coverage\n"
"\t".join(
"gene_id",
"codon",
"values",
"mean_codon_coverage",
"median_codon_coverage",
"var_codon_coverage",
"codon_occurences",
"total_codon_coverage",
)
+ "\n"
)
for gene_id, codon_seq in sorted(read_counts):
values = list(read_counts[gene_id, codon_seq].values())
@@ -183,12 +207,16 @@
fout_df["per_codon_enrichment(total/n_occur)"] = (
fout_df["total_codon_coverage"] / fout_df["codon_occurences"]
)
fout_df["-log10_relative_enrichment(per_codon/total_gene_coverage)"] = -np.log10(
fout_df[
"-log10_relative_enrichment(per_codon/total_gene_coverage)"
] = -np.log10(
fout_df["per_codon_enrichment(total/n_occur)"]
/ fout_df.groupby("gene_id")["total_codon_coverage"].transform("sum")
)
# Overwrite
fout_df.to_csv("{}_genewise.tsv".format(prefix), sep="\t", index=False, header=True)
fout_df.to_csv(
"{}_genewise.tsv".format(prefix), sep="\t", index=False, header=True
)
# Remove infs
fout_df = fout_df.replace([np.inf, -np.inf], np.nan)
fout_df = fout_df.dropna()
43 changes: 33 additions & 10 deletions ribotricer/detect_orfs.py
@@ -174,7 +174,10 @@ def orf_coverage(orf, alignments, offset_5p=0, offset_3p=0):
except KeyError:
coverage.append(0)
else:
if strand in alignments and (chrom, pos) in alignments[strand]:
if (
strand in alignments
and (chrom, pos) in alignments[strand]
):
coverage.append(alignments[strand][(chrom, pos)])
else:
coverage.append(0)
@@ -266,7 +269,9 @@
valid_codons_ratio = valid_codons / n_codons
# total reads in the ORF divided by the length
orf_density = np.sum(codon_coverage) / n_codons
codon_coverage_exceeds_min = codon_coverage >= min_reads_per_codon
codon_coverage_exceeds_min = (
codon_coverage >= min_reads_per_codon
)
status = (
"translating"
if (
@@ -322,7 +327,9 @@ def export_wig(merged_alignments, prefix):
if chrom != cur_chrom:
cur_chrom = chrom
to_write += "variableStep chrom={}\n".format(chrom)
to_write += "{}\t{}\n".format(pos, merged_alignments[strand][(chrom, pos)])
to_write += "{}\t{}\n".format(
pos, merged_alignments[strand][(chrom, pos)]
)
if strand == "+":
fname = "{}_pos.wig".format(prefix)
else:
@@ -380,7 +387,11 @@ def detect_orfs(

# parse the index file
now = datetime.datetime.now()
print(now.strftime("%b %d %H:%M:%S ... started parsing ribotricer index file"))
print(
now.strftime(
"%b %d %H:%M:%S ... started parsing ribotricer index file"
)
)
annotated, refseq = parse_ribotricer_index(ribotricer_index)

# create directory
@@ -391,7 +402,8 @@
now = datetime.datetime.now()
print(
"{} ... {}".format(
now.strftime("%b %d %H:%M:%S"), "started inferring experimental design"
now.strftime("%b %d %H:%M:%S"),
"started inferring experimental design",
)
)
protocol = infer_protocol(bam, refseq, prefix)
@@ -400,13 +412,16 @@
# split bam file into strand and read length
now = datetime.datetime.now()
print(now.strftime("%b %d %H:%M:%S ... started reading bam file"))
alignments, read_length_counts = split_bam(bam, protocol, prefix, read_lengths)
alignments, read_length_counts = split_bam(
bam, protocol, prefix, read_lengths
)

# plot read length distribution
now = datetime.datetime.now()
print(
"{} ... {}".format(
now.strftime("%b %d %H:%M:%S"), "started plotting read length distribution"
now.strftime("%b %d %H:%M:%S"),
"started plotting read length distribution",
)
)
plot_read_lengths(read_length_counts, prefix)
@@ -419,13 +434,20 @@
"started calculating metagene profiles. This may take a long time...",
)
)
metagenes = metagene_coverage(annotated, alignments, read_length_counts, prefix, meta_min_reads = meta_min_reads)
metagenes = metagene_coverage(
annotated,
alignments,
read_length_counts,
prefix,
meta_min_reads=meta_min_reads,
)

# plot metagene profiles
now = datetime.datetime.now()
print(
"\n{} ... {}".format(
now.strftime("%b %d %H:%M:%S"), "started plotting metagene profiles"
now.strftime("%b %d %H:%M:%S"),
"started plotting metagene profiles",
)
)
plot_metagene(metagenes, read_length_counts, prefix)
@@ -435,7 +457,8 @@
now = datetime.datetime.now()
print(
"{} ... {}".format(
now.strftime("%b %d %H:%M:%S"), "started inferring P-site offsets"
now.strftime("%b %d %H:%M:%S"),
"started inferring P-site offsets",
)
)
psite_offsets = align_metagenes(
2 changes: 1 addition & 1 deletion ribotricer/metagene.py
@@ -281,7 +281,7 @@ def align_metagenes(
xcorr = np.correlate(reference, cov, "full")
origin = len(xcorr) // 2
bound = min(base, length)
xcorr = xcorr[origin - bound : origin + bound]
xcorr = xcorr[(origin - bound):(origin + bound)]
lag = np.argmax(xcorr) - len(xcorr) // 2
psite_offsets[length] = lag + TYPICAL_OFFSET
to_write += "\tlag of {}: {}\n".format(length, lag)
3 changes: 2 additions & 1 deletion ribotricer/orf.py
@@ -89,7 +89,8 @@ def from_string(cls, line):
)
)
return None
oid = fields[0]
# ADS: oid below is not used
oid = fields[0]  # noqa: F841
category = fields[1]
tid = fields[2]
ttype = fields[3]
10 changes: 5 additions & 5 deletions ribotricer/orf_seq.py
@@ -92,7 +92,7 @@ def translate_nt_to_aa(seq):
protein = ""
if len(seq) % 3 == 0:
for i in range(0, len(seq), 3):
codon = seq[i : i + 3]
codon = seq[i: i + 3]
if "N" in codon:
protein += "X"
elif codon not in codon_table:
@@ -142,10 +142,10 @@ def orf_seq(ribotricer_index, genome_fasta, saveto, translate=False):
if translate:
if len(seq) % 3 != 0:
sys.stderr.write(
"WARNING: Sequence length with ORF ID '{}' is not a multiple of three. Output sequence might be truncated.\n".format(
orf_id
)
"WARNING: Sequence length with ORF ID '{orf_id}' is not "
"a multiple of three. Output sequence might be "
"truncated.\n"
)
seq = seq[0 : (len(seq) // 3) * 3]
seq = seq[0: (len(seq) // 3) * 3]
seq = translate_nt_to_aa(seq)
fh.write("{}\t{}\n".format(orf_id, seq))
