Skip to content

Commit

Permalink
Improved tests and documentation.
Browse files Browse the repository at this point in the history
  • Loading branch information
legaultmarc committed Oct 26, 2015
1 parent 787ee3f commit 82814b6
Show file tree
Hide file tree
Showing 9 changed files with 232 additions and 23 deletions.
5 changes: 5 additions & 0 deletions docs/db.rst
Expand Up @@ -40,3 +40,8 @@ as keys and lists of ``(position, file seek)`` as values.
.. automodule:: gepyto.db.index
:members:

UCSC
-----

.. automodule:: gepyto.db.ucsc
:members:
13 changes: 12 additions & 1 deletion docs/formats.rst
Expand Up @@ -9,8 +9,19 @@ Impute2
:members:

SeqXML
---------
-------

.. automodule:: gepyto.formats.seqxml
:members:

GTF/GFF
--------

.. automodule:: gepyto.formats.gtf
:members:

Wiggle (fixedStep)
------------------

.. automodule:: gepyto.formats.wig
:members:
15 changes: 12 additions & 3 deletions gepyto/db/ucsc.py
Expand Up @@ -18,9 +18,6 @@
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"


__all__ = ["UCSC", ]


import collections

import numpy as np
Expand Down Expand Up @@ -161,6 +158,18 @@ def get_phylop_100_way(region):
querying. This means that the user should use 1-based indexing, as
usual when creating the Region object.
.. warning::
This function has a fairly low resolution. You should download the raw
data (e.g. from
`goldenpath <http://hgdownload.cse.ucsc.edu/goldenpath/hg19/phyloP100way/>`_
) if you need scores for each base.
Also note that gepyto can't parse bigWig, but it can parse Wig files.
.. warning::
This function is **untested**.
"""
with UCSC() as ucsc:
sql = (
Expand Down
35 changes: 34 additions & 1 deletion gepyto/formats/gtf.py
Expand Up @@ -36,7 +36,40 @@ def __str__(self):


class GTFFile(object):
"""Class representing a GTF file."""
"""Parser for GTF files.
This implementation was based on the format specification as described
here: http://www.sanger.ac.uk/resources/software/gff/spec.html.
You can use this parser on both local files (compressed using gzip, or not)
and on remote files (on a HTTP server).
For every line, this class will return a named tuple with the following
fields:
- seqname
- source
- features
- start
- end
- score
- strand
- frame
- attributes
Example usage:
>>> import gepyto.formats.gtf
>>> url = "http://www.uniprot.org/uniprot/O60503.gff"
>>> gtf = gepyto.formats.gtf.GTFFile(url)
>>> gtf
<gepyto.formats.gtf.GTFFile object at 0x1006dd590>
>>> gtf.readline()
_Line(seqname=u'O60503', source=u'UniProtKB', features=u'Chain', start=1,\
end=1353, score=None, strand=None, frame=None, attributes={u'Note':\
u'Adenylate cyclase type 9', u'ID': u'PRO_0000195708'})
"""

Line = namedtuple("_Line", ["seqname", "source", "features", "start",
"end", "score", "strand", "frame",
Expand Down
30 changes: 24 additions & 6 deletions gepyto/formats/wig.py
Expand Up @@ -20,18 +20,38 @@
import pandas as pd
import six

from ..structures.region import _Segment, Region



class WiggleFile(object):
"""Parser for WIG files.
This returns a pandas dataframe with all the necessary information. In the
process, all the inherent compactness of the Wiggle format is lost in
exchange for an easier to manage representation. This means that more
efficient parsers should be used for large chunks of data.
This implementation is based on the specification from:
http://genome.ucsc.edu/goldenpath/help/wiggle.html
.. warning::
``fixedStep`` is the only implemented mode for now. Future releases
might improve this parser to be more flexible.
To access the parsed information, use the
:py:func:`WiggleFile.as_dataframe` function.
Usage (given a file on disk):
>>> import gepyto.formats.wig
>>> with gepyto.formats.wig.WiggleFile("my_file.wig") as f:
... df = f.as_dataframe()
...
>>> df
chrom pos value
0 chr3 400601 11
1 chr3 400701 22
2 chr3 400801 33
"""
def __init__(self, stream):
self.stream = stream
Expand Down Expand Up @@ -73,7 +93,6 @@ def close(self):
def as_dataframe(self):
return self.data


def _parse_fixed_step(self, header=None):
data = []
for line in self.stream:
Expand All @@ -97,7 +116,6 @@ def _parse_fixed_step(self, header=None):
columns=("chrom", "start", "end", "value")
)


@staticmethod
def _parse_header(line):
line = line.rstrip().split()
Expand Down
32 changes: 27 additions & 5 deletions gepyto/structures/sequences.py
Expand Up @@ -14,7 +14,6 @@
"Lemieux Perreault. All rights reserved.")
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"

__all__ = ["Sequence", ]

try:
from string import maketrans, translate
Expand Down Expand Up @@ -271,6 +270,10 @@ def find_coding_sequences(self, cpu=6):
(ORF, start, end, sequence)
:rtype: tuple
.. warning::
This is currently **untested**.
"""
orfs = collections.OrderedDict([
(1, self.seq),
Expand Down Expand Up @@ -428,9 +431,28 @@ def smith_waterman(seq1, seq2, penalties=None, output="sequences"):
penalized. Also, only one of the potentially many best alignments will be
output.
This implementation is not very optimized. It is not written in a low level
language. It can be used for small sequences or for low number of
comparisons, but should not be used in large scale products.
.. warning::
This implementation is not very optimized. It is not written in a low
level language. It can be used for small sequences or for low number of
comparisons, but should not be used in large scale products.
.. note::
Some functionality like affine gap penalties, or substitution matrices
are not implemented.
The default penalty scheme is the following:
.. code-block:: python
{
"match": 2,
"mismatch": -1,
"gap": -1
}
You can follow this pattern to set your own penalty scores.
"""
if isinstance(seq1, Sequence):
Expand All @@ -449,7 +471,7 @@ def smith_waterman(seq1, seq2, penalties=None, output="sequences"):
else:
msg = "Invalid output mode '{}'. Accepted values are: {}."
raise TypeError(
msg.format(output, ", ".join(SEQUENCES, ALIGNMENT))
msg.format(output, ", ".join([SEQUENCES, ALIGNMENT]))
)

# Default penalties.
Expand Down
56 changes: 55 additions & 1 deletion gepyto/tests/test_formats.py
Expand Up @@ -20,7 +20,7 @@

import numpy as np

from ..formats import impute2
from ..formats import impute2, wig
from .. import formats as fmts
from ..structures.sequences import Sequence

Expand Down Expand Up @@ -336,6 +336,52 @@ def test_hard_call(self):
raise Exception()


class TestWig(unittest.TestCase):
"""Test for the WIG file parser."""
def test_ucsc_example_1(self):
expected = [
("chr3", 400601, 11),
("chr3", 400701, 22),
("chr3", 400801, 33),
]

f = _create_file("fixedStep chrom=chr3 start=400601 step=100\n11\n22\n"
"33\n")
with wig.WiggleFile(f.name) as wig_file:
df = wig_file.as_dataframe()
for i, line in df.iterrows():
self.assertTrue(
all([i == j for i, j in zip(line, expected[i])])
)

def test_ucsc_example_1_names(self):
f = _create_file("fixedStep chrom=chr3 start=400601 step=100\n11\n22\n"
"33\n")
with wig.WiggleFile(f.name) as wig_file:
df = wig_file.as_dataframe()
for i, line in df.iterrows():
self.assertEqual(
list(line.index),
["chrom", "pos", "value"]
)

def test_ucsc_example_2(self):
expected = [
("chr3", 400601, 400605, 11),
("chr3", 400701, 400705, 22),
("chr3", 400801, 400805, 33),
]

f = _create_file("fixedStep chrom=chr3 start=400601 step=100 span=5"
"\n11\n22\n33\n")
with wig.WiggleFile(f.name) as wig_file:
df = wig_file.as_dataframe()
for i, line in df.iterrows():
self.assertTrue(
all([i == j for i, j in zip(line, expected[i])])
)


class TestGTF(unittest.TestCase):
"""Test the GTF file parser."""
def setUp(self):
Expand Down Expand Up @@ -462,3 +508,11 @@ class TestGFF(TestGTF):
"""Test the GFF binding."""
def setUp(self):
self.cls = fmts.gff.GFFFile


def _create_file(s):
"""Create a temporary file and return its handle."""
f = tempfile.NamedTemporaryFile("w")
f.write(s)
f.seek(0)
return f
58 changes: 54 additions & 4 deletions gepyto/tests/test_structures.py
Expand Up @@ -37,10 +37,10 @@ def setUp(self):

# Simple variations
self.snp = variants.SNP(chrom="19", pos=55663495,
rs=self.snp_rs, ref="C", alt="T")
rs=self.snp_rs, ref="C", alt="T")

self.indel = variants.Indel(chrom="19", pos=55663539,
rs=self.indel_rs,
ref="TTC", alt="T")
rs=self.indel_rs, ref="TTC", alt="T")

# Insertions
self.insertion_rs = "rs11273285"
Expand Down Expand Up @@ -87,7 +87,10 @@ def test_indel(self):

def test_insertion(self):
indel = variants.Indel.from_ensembl_api(self.insertion_rs)
self.assertEqual(indel, [self.insertion])
# A new dbSNP build added new alternative alleles, so now we just
# test that the expected variant is in the returned list.
# This way we don't have to update the tests every time.
self.assertTrue(self.insertion in indel)

def test_deletion(self):
indel = variants.ShortVariant.from_ensembl_api(self.deletion_rs)
Expand Down Expand Up @@ -169,6 +172,7 @@ def test_df(self):
[snp1, snp2, indel]
)


class TestGene(unittest.TestCase):
"""Tests for the struct.genes module.
Expand Down Expand Up @@ -251,3 +255,49 @@ def test_reverse_complement(self):
seq = sequences.Sequence("test", "TAGTVTAMCTATK", "DNA")
expected = "MATAGKTABACTA"
self.assertEqual(seq.reverse_complement().seq, expected)

def test_local_alignment_score(self):
score, s1, s2 = sequences.smith_waterman(
"AAAAAAAAAGTGTAAAAAAA",
"AGTGT"
)
# AAAAAAAAAGTGTAAAAAAA
# AGTGT
# 5 matches, free rides on both sides.
self.assertEqual(score, 2 * 5)

def test_local_alignment_sequences(self):
score, s1, s2 = sequences.smith_waterman(
"AAAAAAAAAGTGTAAAAAAA",
"AGTGT"
)
# AAAAAAAAAGTGTAAAAAAA
# AGTGT
# s1 = s1
# s2 = --------AGTGT-------
self.assertEqual(s1, "AAAAAAAAAGTGTAAAAAAA")
self.assertEqual(s2, "--------AGTGT-------")

def test_alignment_gap(self):
score, s1, s2 = sequences.smith_waterman(
"GTCGTCATGA",
"GTCTGAT"
)
# GTCGTCATGA
# GTC TGAT
# MMMIMMMD = 2 * 6 - 1 = 11 - 1
self.assertEqual(score, 11)
self.assertEqual(s1, "GTCGTCATGA-")
self.assertEqual(s2, "---GTC-TGAT")

def test_alignment_return_alignment(self):
score, alg = sequences.smith_waterman(
"GTCGTCATGA",
"GTCTGAT",
output="alignment"
)
self.assertEqual(alg, "IIIMMMIMMMD")

def test_alignment_weird_output(self):
with self.assertRaises(TypeError):
sequences.smith_waterman("ABC", "ABC", output="potato")

0 comments on commit 82814b6

Please sign in to comment.