Improved tests and documentation.

legaultmarc · Oct 26, 2015 · 82814b6 · 82814b6
1 parent 787ee3f
commit 82814b6
Show file tree

Hide file tree

Showing 9 changed files with 232 additions and 23 deletions.
diff --git a/docs/db.rst b/docs/db.rst
@@ -40,3 +40,8 @@ as keys and lists of ``(position, file seek)`` as values.
 .. automodule:: gepyto.db.index
     :members:
 
+UCSC
+-----
+
+.. automodule:: gepyto.db.ucsc
+    :members:
diff --git a/docs/formats.rst b/docs/formats.rst
@@ -9,8 +9,19 @@ Impute2
     :members:
 
 SeqXML
----------
+-------
 
 .. automodule:: gepyto.formats.seqxml
     :members:
 
+GTF/GFF
+--------
+
+.. automodule:: gepyto.formats.gtf
+    :members:
+
+Wiggle (fixedStep)
+------------------
+
+.. automodule:: gepyto.formats.wig
+    :members:
diff --git a/gepyto/db/ucsc.py b/gepyto/db/ucsc.py
@@ -18,9 +18,6 @@
 __license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"
 
 
-__all__ = ["UCSC", ]
-
-
 import collections
 
 import numpy as np
@@ -161,6 +158,18 @@ def get_phylop_100_way(region):
     querying. This means that the user should use 1-based indexing, as
     usual when creating the Region object.
 
+    .. warning::
+
+        This function has a fairly low resolution. You should download the raw
+        data (e.g. from
+        `goldenpath <http://hgdownload.cse.ucsc.edu/goldenpath/hg19/phyloP100way/>`_
+        ) if you need scores for each base.
+        Also note that gepyto can't parse bigWig, but it can parse Wig files.
+
+    .. warning::
+
+        This function is **untested**.
+
     """
     with UCSC() as ucsc:
         sql = (

diff --git a/gepyto/formats/gtf.py b/gepyto/formats/gtf.py
@@ -36,7 +36,40 @@ def __str__(self):
 
 
 class GTFFile(object):
-    """Class representing a GTF file."""
+    """Parser for GTF files.
+
+    This implementation was based on the format specification as described
+    here: http://www.sanger.ac.uk/resources/software/gff/spec.html.
+
+    You can use this parser on both local files (compressed using gzip, or not)
+    and on remote files (on a HTTP server).
+
+    For every line, this class will return a named tuple with the following
+    fields:
+
+    - seqname
+    - source
+    - features
+    - start
+    - end
+    - score
+    - strand
+    - frame
+    - attributes
+
+    Example usage:
+
+    >>> import gepyto.formats.gtf
+    >>> url = "http://www.uniprot.org/uniprot/O60503.gff"
+    >>> gtf = gepyto.formats.gtf.GTFFile(url)
+    >>> gtf
+    <gepyto.formats.gtf.GTFFile object at 0x1006dd590>
+    >>> gtf.readline()
+    _Line(seqname=u'O60503', source=u'UniProtKB', features=u'Chain', start=1,\
+end=1353, score=None, strand=None, frame=None, attributes={u'Note':\
+u'Adenylate cyclase type 9', u'ID': u'PRO_0000195708'})
+
+    """
 
     Line = namedtuple("_Line", ["seqname", "source", "features", "start",
                                 "end", "score", "strand", "frame",

diff --git a/gepyto/formats/wig.py b/gepyto/formats/wig.py
@@ -20,18 +20,38 @@
 import pandas as pd
 import six
 
-from ..structures.region import _Segment, Region
-
-
 
 class WiggleFile(object):
     """Parser for WIG files.
-    
+
     This returns a pandas dataframe with all the necessary information. In the
     process, all the inherent compactness of the Wiggle format is lost in
     exchange for an easier to manage representation. This means that more
     efficient parsers should be used for large chunks of data.
 
+    This implementation is based on the specification from:
+    http://genome.ucsc.edu/goldenpath/help/wiggle.html
+
+    .. warning::
+        ``fixedStep`` is the only implemented mode for now. Future releases
+        might improve this parser to be more flexible.
+
+    To access the parsed information, use the
+    :py:func:`WiggleFile.as_dataframe` function.
+
+    Usage (given a file on disk):
+
+    >>> import gepyto.formats.wig
+    >>> with gepyto.formats.wig.WiggleFile("my_file.wig") as f:
+    ...     df = f.as_dataframe()
+    ...
+    >>> df
+      chrom     pos  value
+    0  chr3  400601     11
+    1  chr3  400701     22
+    2  chr3  400801     33
+
+
     """
     def __init__(self, stream):
         self.stream = stream
@@ -73,7 +93,6 @@ def close(self):
     def as_dataframe(self):
         return self.data
 
-
     def _parse_fixed_step(self, header=None):
         data = []
         for line in self.stream:
@@ -97,7 +116,6 @@ def _parse_fixed_step(self, header=None):
             columns=("chrom", "start", "end", "value")
         )
 
-
     @staticmethod
     def _parse_header(line):
         line = line.rstrip().split()

diff --git a/gepyto/structures/sequences.py b/gepyto/structures/sequences.py
@@ -14,7 +14,6 @@
                  "Lemieux Perreault. All rights reserved.")
 __license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"
 
-__all__ = ["Sequence", ]
 
 try:
     from string import maketrans, translate
@@ -271,6 +270,10 @@ def find_coding_sequences(self, cpu=6):
                   (ORF, start, end, sequence)
         :rtype: tuple
 
+        .. warning::
+
+            This is currently **untested**.
+
         """
         orfs = collections.OrderedDict([
             (1, self.seq),
@@ -428,9 +431,28 @@ def smith_waterman(seq1, seq2, penalties=None, output="sequences"):
     penalized. Also, only one of the potentially many best alignments will be
     output.
 
-    This implementation is not very optimized. It is not written in a low level
-    language. It can be used for small sequences or for low number of
-    comparisons, but should not be used in large scale products.
+    .. warning::
+
+        This implementation is not very optimized. It is not written in a low
+        level language. It can be used for small sequences or for low number of
+        comparisons, but should not be used in large scale products.
+
+    .. note::
+
+        Some functionality like affine gap penalties, or substitution matrices
+        are not implemented.
+
+    The default penalty scheme is the following:
+
+    .. code-block:: python
+
+        {
+            "match": 2,
+            "mismatch": -1,
+            "gap": -1
+        }
+
+    You can follow this pattern to set your own penalty scores.
 
     """
     if isinstance(seq1, Sequence):
@@ -449,7 +471,7 @@ def smith_waterman(seq1, seq2, penalties=None, output="sequences"):
     else:
         msg = "Invalid output mode '{}'. Accepted values are: {}."
         raise TypeError(
-            msg.format(output, ", ".join(SEQUENCES, ALIGNMENT))
+            msg.format(output, ", ".join([SEQUENCES, ALIGNMENT]))
         )
 
     # Default penalties.

diff --git a/gepyto/tests/test_formats.py b/gepyto/tests/test_formats.py
@@ -20,7 +20,7 @@
 
 import numpy as np
 
-from ..formats import impute2
+from ..formats import impute2, wig
 from .. import formats as fmts
 from ..structures.sequences import Sequence
 
@@ -336,6 +336,52 @@ def test_hard_call(self):
                     raise Exception()
 
 
+class TestWig(unittest.TestCase):
+    """Test for the WIG file parser."""
+    def test_ucsc_example_1(self):
+        expected = [
+            ("chr3", 400601, 11),
+            ("chr3", 400701, 22),
+            ("chr3", 400801, 33),
+        ]
+
+        f = _create_file("fixedStep chrom=chr3 start=400601 step=100\n11\n22\n"
+                         "33\n")
+        with wig.WiggleFile(f.name) as wig_file:
+            df = wig_file.as_dataframe()
+            for i, line in df.iterrows():
+                self.assertTrue(
+                    all([i == j for i, j in zip(line, expected[i])])
+                )
+
+    def test_ucsc_example_1_names(self):
+        f = _create_file("fixedStep chrom=chr3 start=400601 step=100\n11\n22\n"
+                         "33\n")
+        with wig.WiggleFile(f.name) as wig_file:
+            df = wig_file.as_dataframe()
+            for i, line in df.iterrows():
+                self.assertEqual(
+                    list(line.index),
+                    ["chrom", "pos", "value"]
+                )
+
+    def test_ucsc_example_2(self):
+        expected = [
+            ("chr3", 400601, 400605, 11),
+            ("chr3", 400701, 400705, 22),
+            ("chr3", 400801, 400805, 33),
+        ]
+
+        f = _create_file("fixedStep chrom=chr3 start=400601 step=100 span=5"
+                         "\n11\n22\n33\n")
+        with wig.WiggleFile(f.name) as wig_file:
+            df = wig_file.as_dataframe()
+            for i, line in df.iterrows():
+                self.assertTrue(
+                    all([i == j for i, j in zip(line, expected[i])])
+                )
+
+
 class TestGTF(unittest.TestCase):
     """Test the GTF file parser."""
     def setUp(self):
@@ -462,3 +508,11 @@ class TestGFF(TestGTF):
     """Test the GFF binding."""
     def setUp(self):
         self.cls = fmts.gff.GFFFile
+
+
+def _create_file(s):
+    """Create a temporary file and return its handle."""
+    f = tempfile.NamedTemporaryFile("w")
+    f.write(s)
+    f.seek(0)
+    return f
diff --git a/gepyto/tests/test_structures.py b/gepyto/tests/test_structures.py
@@ -37,10 +37,10 @@ def setUp(self):
 
         # Simple variations
         self.snp = variants.SNP(chrom="19", pos=55663495,
-                                       rs=self.snp_rs, ref="C", alt="T")
+                                rs=self.snp_rs, ref="C", alt="T")
+
         self.indel = variants.Indel(chrom="19", pos=55663539,
-                                           rs=self.indel_rs,
-                                           ref="TTC", alt="T")
+                                    rs=self.indel_rs, ref="TTC", alt="T")
 
         # Insertions
         self.insertion_rs = "rs11273285"
@@ -87,7 +87,10 @@ def test_indel(self):
 
     def test_insertion(self):
         indel = variants.Indel.from_ensembl_api(self.insertion_rs)
-        self.assertEqual(indel, [self.insertion])
+        # A new dbSNP build added new alternative alleles, so now we just
+        # test that the expected variant is in the returned list.
+        # This way we don't have to update the tests every time.
+        self.assertTrue(self.insertion in indel)
 
     def test_deletion(self):
         indel = variants.ShortVariant.from_ensembl_api(self.deletion_rs)
@@ -169,6 +172,7 @@ def test_df(self):
             [snp1, snp2, indel]
         )
 
+
 class TestGene(unittest.TestCase):
     """Tests for the struct.genes module.
 
@@ -251,3 +255,49 @@ def test_reverse_complement(self):
         seq = sequences.Sequence("test", "TAGTVTAMCTATK", "DNA")
         expected = "MATAGKTABACTA"
         self.assertEqual(seq.reverse_complement().seq, expected)
+
+    def test_local_alignment_score(self):
+        score, s1, s2 = sequences.smith_waterman(
+            "AAAAAAAAAGTGTAAAAAAA",
+            "AGTGT"
+        )
+        # AAAAAAAAAGTGTAAAAAAA
+        #         AGTGT
+        # 5 matches, free rides on both sides.
+        self.assertEqual(score, 2 * 5)
+
+    def test_local_alignment_sequences(self):
+        score, s1, s2 = sequences.smith_waterman(
+            "AAAAAAAAAGTGTAAAAAAA",
+            "AGTGT"
+        )
+        # AAAAAAAAAGTGTAAAAAAA
+        #         AGTGT
+        # s1 = s1
+        # s2 = --------AGTGT-------
+        self.assertEqual(s1, "AAAAAAAAAGTGTAAAAAAA")
+        self.assertEqual(s2, "--------AGTGT-------")
+
+    def test_alignment_gap(self):
+        score, s1, s2 = sequences.smith_waterman(
+            "GTCGTCATGA",
+            "GTCTGAT"
+        )
+        # GTCGTCATGA
+        #    GTC TGAT
+        #    MMMIMMMD = 2 * 6 - 1 = 11 - 1
+        self.assertEqual(score, 11)
+        self.assertEqual(s1, "GTCGTCATGA-")
+        self.assertEqual(s2, "---GTC-TGAT")
+
+    def test_alignment_return_alignment(self):
+        score, alg = sequences.smith_waterman(
+            "GTCGTCATGA",
+            "GTCTGAT",
+            output="alignment"
+        )
+        self.assertEqual(alg, "IIIMMMIMMMD")
+
+    def test_alignment_weird_output(self):
+        with self.assertRaises(TypeError):
+            sequences.smith_waterman("ABC", "ABC", output="potato")