From 82814b6d5a990bc5f2d0ddf4e1748bd13fa99270 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Legault?= <legaultmarc@gmail.com>
Date: Mon, 26 Oct 2015 15:52:44 -0400
Subject: [PATCH] Improved tests and documentation.

---
 docs/db.rst                       |  5 +++
 docs/formats.rst                  | 13 ++++++-
 gepyto/db/ucsc.py                 | 15 ++++++--
 gepyto/formats/gtf.py             | 35 ++++++++++++++++++-
 gepyto/formats/wig.py             | 30 ++++++++++++----
 gepyto/structures/sequences.py    | 32 ++++++++++++++---
 gepyto/tests/test_formats.py      | 56 ++++++++++++++++++++++++++++-
 gepyto/tests/test_structures.py   | 58 ++++++++++++++++++++++++++++---
 gepyto/visualization/ideograms.py | 11 ++++--
 9 files changed, 232 insertions(+), 23 deletions(-)

diff --git a/docs/db.rst b/docs/db.rst
index e32a6fc..48a2247 100644
--- a/docs/db.rst
+++ b/docs/db.rst
@@ -40,3 +40,8 @@ as keys and lists of ``(position, file seek)`` as values.
 .. automodule:: gepyto.db.index
     :members:
 
+UCSC
+-----
+
+.. automodule:: gepyto.db.ucsc
+    :members:
diff --git a/docs/formats.rst b/docs/formats.rst
index 54b54f3..f7fc724 100644
--- a/docs/formats.rst
+++ b/docs/formats.rst
@@ -9,8 +9,19 @@ Impute2
     :members:
 
 SeqXML
----------
+-------
 
 .. automodule:: gepyto.formats.seqxml
     :members:
 
+GTF/GFF
+--------
+
+.. automodule:: gepyto.formats.gtf
+    :members:
+
+Wiggle (fixedStep)
+------------------
+
+.. automodule:: gepyto.formats.wig
+    :members:
diff --git a/gepyto/db/ucsc.py b/gepyto/db/ucsc.py
index 7a2f46b..22986e2 100644
--- a/gepyto/db/ucsc.py
+++ b/gepyto/db/ucsc.py
@@ -18,9 +18,6 @@
 __license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"
 
 
-__all__ = ["UCSC", ]
-
-
 import collections
 
 import numpy as np
@@ -161,6 +158,18 @@ def get_phylop_100_way(region):
     querying. This means that the user should use 1-based indexing, as
     usual when creating the Region object.
 
+    .. warning::
+
+        This function has a fairly low resolution. You should download the raw
+        data (e.g. from
+        `goldenpath <http://hgdownload.cse.ucsc.edu/goldenpath/hg19/phyloP100way/>`_
+        ) if you need scores for each base.
+        Also note that gepyto can't parse bigWig, but it can parse Wig files.
+
+    .. warning::
+
+        This function is **untested**.
+
     """
     with UCSC() as ucsc:
         sql = (
diff --git a/gepyto/formats/gtf.py b/gepyto/formats/gtf.py
index 1d73eda..3c2ee62 100644
--- a/gepyto/formats/gtf.py
+++ b/gepyto/formats/gtf.py
@@ -36,7 +36,40 @@ def __str__(self):
 
 
 class GTFFile(object):
-    """Class representing a GTF file."""
+    """Parser for GTF files.
+
+    This implementation was based on the format specification as described
+    here: http://www.sanger.ac.uk/resources/software/gff/spec.html.
+
+    You can use this parser on both local files (compressed using gzip, or not)
+    and on remote files (on a HTTP server).
+
+    For every line, this class will return a named tuple with the following
+    fields:
+
+    - seqname
+    - source
+    - features
+    - start
+    - end
+    - score
+    - strand
+    - frame
+    - attributes
+
+    Example usage:
+
+    >>> import gepyto.formats.gtf
+    >>> url = "http://www.uniprot.org/uniprot/O60503.gff"
+    >>> gtf = gepyto.formats.gtf.GTFFile(url)
+    >>> gtf
+    <gepyto.formats.gtf.GTFFile object at 0x1006dd590>
+    >>> gtf.readline()
+    _Line(seqname=u'O60503', source=u'UniProtKB', features=u'Chain', start=1,\
+end=1353, score=None, strand=None, frame=None, attributes={u'Note':\
+u'Adenylate cyclase type 9', u'ID': u'PRO_0000195708'})
+
+    """
 
     Line = namedtuple("_Line", ["seqname", "source", "features", "start",
                                 "end", "score", "strand", "frame",
diff --git a/gepyto/formats/wig.py b/gepyto/formats/wig.py
index 32411c1..0d1f5cd 100644
--- a/gepyto/formats/wig.py
+++ b/gepyto/formats/wig.py
@@ -20,18 +20,38 @@
 import pandas as pd
 import six
 
-from ..structures.region import _Segment, Region
-
-
 
 class WiggleFile(object):
     """Parser for WIG files.
-    
+
     This returns a pandas dataframe with all the necessary information. In the
     process, all the inherent compactness of the Wiggle format is lost in
     exchange for an easier to manage representation. This means that more
     efficient parsers should be used for large chunks of data.
 
+    This implementation is based on the specification from:
+    http://genome.ucsc.edu/goldenpath/help/wiggle.html
+
+    .. warning::
+        ``fixedStep`` is the only implemented mode for now. Future releases
+        might improve this parser to be more flexible.
+
+    To access the parsed information, use the
+    :py:func:`WiggleFile.as_dataframe` function.
+
+    Usage (given a file on disk):
+
+    >>> import gepyto.formats.wig
+    >>> with gepyto.formats.wig.WiggleFile("my_file.wig") as f:
+    ...     df = f.as_dataframe()
+    ...
+    >>> df
+      chrom     pos  value
+    0  chr3  400601     11
+    1  chr3  400701     22
+    2  chr3  400801     33
+
+
     """
     def __init__(self, stream):
         self.stream = stream
@@ -73,7 +93,6 @@ def close(self):
     def as_dataframe(self):
         return self.data
 
-
     def _parse_fixed_step(self, header=None):
         data = []
         for line in self.stream:
@@ -97,7 +116,6 @@ def _parse_fixed_step(self, header=None):
             columns=("chrom", "start", "end", "value")
         )
 
-
     @staticmethod
     def _parse_header(line):
         line = line.rstrip().split()
diff --git a/gepyto/structures/sequences.py b/gepyto/structures/sequences.py
index 09548f0..4270271 100644
--- a/gepyto/structures/sequences.py
+++ b/gepyto/structures/sequences.py
@@ -14,7 +14,6 @@
                  "Lemieux Perreault. All rights reserved.")
 __license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"
 
-__all__ = ["Sequence", ]
 
 try:
     from string import maketrans, translate
@@ -271,6 +270,10 @@ def find_coding_sequences(self, cpu=6):
                   (ORF, start, end, sequence)
         :rtype: tuple
 
+        .. warning::
+
+            This is currently **untested**.
+
         """
         orfs = collections.OrderedDict([
             (1, self.seq),
@@ -428,9 +431,28 @@ def smith_waterman(seq1, seq2, penalties=None, output="sequences"):
     penalized. Also, only one of the potentially many best alignments will be
     output.
 
-    This implementation is not very optimized. It is not written in a low level
-    language. It can be used for small sequences or for low number of
-    comparisons, but should not be used in large scale products.
+    .. warning::
+
+        This implementation is not very optimized. It is not written in a low
+        level language. It can be used for small sequences or for low number of
+        comparisons, but should not be used in large scale products.
+
+    .. note::
+
+        Some functionality like affine gap penalties, or substitution matrices
+        are not implemented.
+
+    The default penalty scheme is the following:
+
+    .. code-block:: python
+
+        {
+            "match": 2,
+            "mismatch": -1,
+            "gap": -1
+        }
+
+    You can follow this pattern to set your own penalty scores.
 
     """
     if isinstance(seq1, Sequence):
@@ -449,7 +471,7 @@ def smith_waterman(seq1, seq2, penalties=None, output="sequences"):
     else:
         msg = "Invalid output mode '{}'. Accepted values are: {}."
         raise TypeError(
-            msg.format(output, ", ".join(SEQUENCES, ALIGNMENT))
+            msg.format(output, ", ".join([SEQUENCES, ALIGNMENT]))
         )
 
     # Default penalties.
diff --git a/gepyto/tests/test_formats.py b/gepyto/tests/test_formats.py
index 05e6955..b50b958 100644
--- a/gepyto/tests/test_formats.py
+++ b/gepyto/tests/test_formats.py
@@ -20,7 +20,7 @@
 
 import numpy as np
 
-from ..formats import impute2
+from ..formats import impute2, wig
 from .. import formats as fmts
 from ..structures.sequences import Sequence
 
@@ -336,6 +336,52 @@ def test_hard_call(self):
                     raise Exception()
 
 
+class TestWig(unittest.TestCase):
+    """Test for the WIG file parser."""
+    def test_ucsc_example_1(self):
+        expected = [
+            ("chr3", 400601, 11),
+            ("chr3", 400701, 22),
+            ("chr3", 400801, 33),
+        ]
+
+        f = _create_file("fixedStep chrom=chr3 start=400601 step=100\n11\n22\n"
+                         "33\n")
+        with wig.WiggleFile(f.name) as wig_file:
+            df = wig_file.as_dataframe()
+            for i, line in df.iterrows():
+                self.assertTrue(
+                    all([i == j for i, j in zip(line, expected[i])])
+                )
+
+    def test_ucsc_example_1_names(self):
+        f = _create_file("fixedStep chrom=chr3 start=400601 step=100\n11\n22\n"
+                         "33\n")
+        with wig.WiggleFile(f.name) as wig_file:
+            df = wig_file.as_dataframe()
+            for i, line in df.iterrows():
+                self.assertEqual(
+                    list(line.index),
+                    ["chrom", "pos", "value"]
+                )
+
+    def test_ucsc_example_2(self):
+        expected = [
+            ("chr3", 400601, 400605, 11),
+            ("chr3", 400701, 400705, 22),
+            ("chr3", 400801, 400805, 33),
+        ]
+
+        f = _create_file("fixedStep chrom=chr3 start=400601 step=100 span=5"
+                         "\n11\n22\n33\n")
+        with wig.WiggleFile(f.name) as wig_file:
+            df = wig_file.as_dataframe()
+            for i, line in df.iterrows():
+                self.assertTrue(
+                    all([i == j for i, j in zip(line, expected[i])])
+                )
+
+
 class TestGTF(unittest.TestCase):
     """Test the GTF file parser."""
     def setUp(self):
@@ -462,3 +508,11 @@ class TestGFF(TestGTF):
     """Test the GFF binding."""
     def setUp(self):
         self.cls = fmts.gff.GFFFile
+
+
+def _create_file(s):
+    """Create a temporary file and return its handle."""
+    f = tempfile.NamedTemporaryFile("w")
+    f.write(s)
+    f.seek(0)
+    return f
diff --git a/gepyto/tests/test_structures.py b/gepyto/tests/test_structures.py
index cd61c1a..d7a477f 100644
--- a/gepyto/tests/test_structures.py
+++ b/gepyto/tests/test_structures.py
@@ -37,10 +37,10 @@ def setUp(self):
 
         # Simple variations
         self.snp = variants.SNP(chrom="19", pos=55663495,
-                                       rs=self.snp_rs, ref="C", alt="T")
+                                rs=self.snp_rs, ref="C", alt="T")
+
         self.indel = variants.Indel(chrom="19", pos=55663539,
-                                           rs=self.indel_rs,
-                                           ref="TTC", alt="T")
+                                    rs=self.indel_rs, ref="TTC", alt="T")
 
         # Insertions
         self.insertion_rs = "rs11273285"
@@ -87,7 +87,10 @@ def test_indel(self):
 
     def test_insertion(self):
         indel = variants.Indel.from_ensembl_api(self.insertion_rs)
-        self.assertEqual(indel, [self.insertion])
+        # A new dbSNP build added new alternative alleles, so now we just
+        # test that the expected variant is in the returned list.
+        # This way we don't have to update the tests every time.
+        self.assertTrue(self.insertion in indel)
 
     def test_deletion(self):
         indel = variants.ShortVariant.from_ensembl_api(self.deletion_rs)
@@ -169,6 +172,7 @@ def test_df(self):
             [snp1, snp2, indel]
         )
 
+
 class TestGene(unittest.TestCase):
     """Tests for the struct.genes module.
 
@@ -251,3 +255,49 @@ def test_reverse_complement(self):
         seq = sequences.Sequence("test", "TAGTVTAMCTATK", "DNA")
         expected = "MATAGKTABACTA"
         self.assertEqual(seq.reverse_complement().seq, expected)
+
+    def test_local_alignment_score(self):
+        score, s1, s2 = sequences.smith_waterman(
+            "AAAAAAAAAGTGTAAAAAAA",
+            "AGTGT"
+        )
+        # AAAAAAAAAGTGTAAAAAAA
+        #         AGTGT
+        # 5 matches, free rides on both sides.
+        self.assertEqual(score, 2 * 5)
+
+    def test_local_alignment_sequences(self):
+        score, s1, s2 = sequences.smith_waterman(
+            "AAAAAAAAAGTGTAAAAAAA",
+            "AGTGT"
+        )
+        # AAAAAAAAAGTGTAAAAAAA
+        #         AGTGT
+        # s1 = s1
+        # s2 = --------AGTGT-------
+        self.assertEqual(s1, "AAAAAAAAAGTGTAAAAAAA")
+        self.assertEqual(s2, "--------AGTGT-------")
+
+    def test_alignment_gap(self):
+        score, s1, s2 = sequences.smith_waterman(
+            "GTCGTCATGA",
+            "GTCTGAT"
+        )
+        # GTCGTCATGA
+        #    GTC TGAT
+        #    MMMIMMMD = 2 * 6 - 1 = 11 - 1
+        self.assertEqual(score, 11)
+        self.assertEqual(s1, "GTCGTCATGA-")
+        self.assertEqual(s2, "---GTC-TGAT")
+
+    def test_alignment_return_alignment(self):
+        score, alg = sequences.smith_waterman(
+            "GTCGTCATGA",
+            "GTCTGAT",
+            output="alignment"
+        )
+        self.assertEqual(alg, "IIIMMMIMMMD")
+
+    def test_alignment_weird_output(self):
+        with self.assertRaises(TypeError):
+            sequences.smith_waterman("ABC", "ABC", output="potato")
diff --git a/gepyto/visualization/ideograms.py b/gepyto/visualization/ideograms.py
index 30a93bc..6e54e37 100644
--- a/gepyto/visualization/ideograms.py
+++ b/gepyto/visualization/ideograms.py
@@ -1,4 +1,3 @@
-
 # Utilities to handle chromosome ideogram plotting.
 
 # This file is part of gepyto.
@@ -12,7 +11,6 @@
 import operator
 from collections import namedtuple
 
-import matplotlib.pyplot as plt
 import matplotlib.patches as patches
 
 from ..db import ucsc
@@ -41,6 +39,11 @@ def plot_chr_ideogram(ax, chrom, loc="bottom", cyto_color=_ucsc_cytoband_color,
                       proportion=0.05):
     """Plot the chromosome ideogram at the bottom of the axe.
 
+    This function can be used if you want to plot cytobands corresponding to a
+    locus. As an example, if you want to display association p-values and the
+    corresponding cytoband on the same plot, you could use this function. Note
+    that the axis should correspond to the chromosomal position.
+
     :param ax: the axe that will contain the genes in the region.
     :type ax: :py:class:`matplotlib.axes.Axes`
 
@@ -69,6 +72,10 @@ def plot_chr_ideogram(ax, chrom, loc="bottom", cyto_color=_ucsc_cytoband_color,
         right, the width is determined as
         :math:`(x_{max} - x_{min}) * proportion`.
 
+    .. warning::
+
+        This function is not covered by unittests.
+
     """
     # Checking the location value
     assert loc in {"bottom", "top", "left", "right"}, "invalid location"