diff --git a/.gitignore b/.gitignore index bb6d856..833249c 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ htmlcov .ipynb_checkpoints build +_build diff --git a/.travis.yml b/.travis.yml index 33cf18c..7668ff0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,7 @@ install: - pip install six - pip install numpy - pip install pandas + - pip install -U mock - pip install coveralls script: - coverage run setup.py test diff --git a/README.mkd b/README.mkd index d9d0e65..08c125d 100644 --- a/README.mkd +++ b/README.mkd @@ -5,7 +5,9 @@ # pyplink - Module to process Plink's binary files -`PyPlink` is a Python module to read and write Plink's binary files. +`PyPlink` is a Python module to read and write Plink's binary files. Short +documentation available at +[https://lemieuxl.github.io/pyplink/](https://lemieuxl.github.io/pyplink/). ## Dependencies @@ -67,12 +69,11 @@ conda update pyplink -c http://statgen.org/wp-content/uploads/Softwares/pyplink To test the module, just perform the following command: -```python ->>> import pyplink ->>> pyplink.test() +```console +$ python -m pyplink.tests ............................................. ---------------------------------------------------------------------- -Ran 45 tests in 0.480s +Ran 45 tests in 0.334s OK ``` diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..bcc041b --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = pyplink +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..b2a1082 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# pyplink documentation build configuration file, created by +# sphinx-quickstart on Fri Mar 10 09:00:31 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. + +import os +import sys +sys.path.insert(0, os.path.abspath('..')) + + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.intersphinx', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'sphinx.ext.githubpages', + 'sphinx.ext.napoleon', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'pyplink' +copyright = '2017, Louis-Philippe Lemieux Perreault' +author = 'Louis-Philippe Lemieux Perreault' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +import pyplink +version = ".".join(pyplink.__version__.split(".")[:-1]) +release = pyplink.__version__ + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. + +html_theme_options = { + 'github_user': 'lemieuxl', + 'github_repo': 'pyplink', + 'github_button': False, + 'fixed_sidebar': True, +} + +html_sidebars = { + '**': [ + 'about.html', + 'navigation.html', + 'relations.html', + 'searchbox.html', + 'donate.html', + ] +} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pyplinkdoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'pyplink.tex', 'pyplink Documentation', + 'Louis-Philippe Lemieux Perreault', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'pyplink', 'pyplink Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'pyplink', 'pyplink Documentation', + author, 'pyplink', 'One line description of project.', + 'Miscellaneous'), +] + + + + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = { + 'https://docs.python.org/3': None, + 'numpy': ('http://docs.scipy.org/doc/numpy/', None), + 'pandas': ('http://pandas.pydata.org/pandas-docs/stable/', None), +} diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..b9aca61 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,39 @@ +.. pyplink documentation master file, created by + sphinx-quickstart on Fri Mar 10 09:00:31 2017. + +pyplink +======== + +:py:mod:`pyplink` is a Python (2 and 3) binary Plink file parser and writer +released under the MIT licence. The difference with existing parsers (and Plink +itself) is that :py:mod:`pyplink` does not load the BED file in memory, making +possible to work with extremely large files (*e.g.* 1000 Genomes Phase 3 +files). + +For more information on how to use :py:mod:`pyplink`, refer to the +:doc:`API documentation `. Below is a snippet describing the most +common usage of the module. + + +.. code-block:: python + + from pyplink import PyPlink + + with PyPlink("plink_file_prefix") as bed: + # Getting the BIM and FAM + bim = bed.get_bim() + fam = bed.get_fam() + + # Iterating over all loci + for loci_name, genotypes in bed: + pass + + # Getting the genotypes of a single marker (numpy.ndarray) + genotypes = bed.get_geno_marker("rs12345") + + +.. toctree:: + :hidden: + + installation + pyplink diff --git a/docs/installation.rst b/docs/installation.rst new file mode 100644 index 0000000..c7951dc --- /dev/null +++ b/docs/installation.rst @@ -0,0 +1,30 @@ + +Installation +============= + +You can install :py:mod:`pyplink` using either ``pip`` or ``conda``. + +.. code-block:: bash + + # Using pip + pip install pyplink + + +.. code-block:: bash + + # Using conda + conda config --add channels http://statgen.org/wp-content/uploads/Softwares/pyplink + conda install pyplink + + +To test the installation, run ``python`` and execute the following commands. + +.. code-block:: python + + >>> import pyplink + >>> pyplink.test() + ............................................. + ---------------------------------------------------------------------- + Ran 45 tests in 0.343s + + OK diff --git a/docs/pyplink.rst b/docs/pyplink.rst new file mode 100644 index 0000000..f9315a3 --- /dev/null +++ b/docs/pyplink.rst @@ -0,0 +1,11 @@ + +pyplink's API +============== + +.. autoclass:: pyplink.PyPlink + :members: + + +.. automodule:: pyplink + :members: + diff --git a/pyplink/__init__.py b/pyplink/__init__.py index 39c8ed0..dc8f1b0 100644 --- a/pyplink/__init__.py +++ b/pyplink/__init__.py @@ -25,7 +25,7 @@ # THE SOFTWARE. -from .pyplink import * +from .pyplink import PyPlink try: from .version import pyplink_version as __version__ @@ -46,7 +46,7 @@ def test(verbosity=1): # pragma: no cover """Executes all the tests for pyplink. Args: - verbosity (int): the verbosity level for unittest + verbosity (int): The verbosity level for :py:mod:`unittest`. Just set ``verbosity`` to an integer higher than ``1`` to have more information about the tests. diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index f09e5ad..932d6c5 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -26,7 +26,9 @@ import os -import warnings +import logging +from itertools import repeat +from collections import Counter from io import UnsupportedOperation try: @@ -48,8 +50,8 @@ __all__ = ["PyPlink"] -# Allowing for warnings -warnings.simplefilter("once", DeprecationWarning) +# The logger +logger = logging.getLogger(__name__) # The recoding values @@ -61,7 +63,28 @@ class PyPlink(object): - """Reads and store a set of binary Plink files.""" + """Reads and store a set of binary Plink files. + + Args: + prefix (str): The prefix of the binary Plink files. + mode (str): The open mode for the binary Plink file. + bed_format (str): The type of bed (SNP-major or INDIVIDUAL-major). + + Reads or write binary Plink files (BED, BIM and FAM). + + .. code-block:: python + + from pyplink import PyPlink + + # Reading BED files + with PyPlink("plink_file_prefix") as bed: + pass + + # Writing BED files + with PyPlink("plink_file_prefix", "w") as bed: + pass + + """ # The genotypes values _geno_values = np.array( @@ -72,18 +95,8 @@ class PyPlink(object): dtype=np.int8, ) - def __init__(self, i_prefix, mode="r", bed_format="SNP-major"): - """Initializes a new PyPlink object. - - Args: - i_prefix (str): the prefix of the binary Plink files - mode (str): the open mode for the binary Plink file - nb_samples (int): the number of samples - bed_format (str): the type of bed (SNP-major or INDIVIDUAL-major) - - Reads or write binary Plink files (BED, BIM and FAM). - - """ + def __init__(self, prefix, mode="r", bed_format="SNP-major"): + """Initializes a new PyPlink instance.""" # The mode self._mode = mode @@ -93,9 +106,9 @@ def __init__(self, i_prefix, mode="r", bed_format="SNP-major"): self._bed_format = bed_format # These are the name of the files - self.bed_filename = "{}.bed".format(i_prefix) - self.bim_filename = "{}.bim".format(i_prefix) - self.fam_filename = "{}.fam".format(i_prefix) + self.bed_filename = "{}.bed".format(prefix) + self.bim_filename = "{}.bim".format(prefix) + self.fam_filename = "{}.fam".format(prefix) if self._mode == "r": if self._bed_format != "SNP-major": @@ -161,12 +174,18 @@ def __exit__(self, *args): self.close() def close(self): - """Closes the BED file if required.""" + """Closes the BED file.""" # Closing the BED file self._bed.close() def next(self): - """The next function.""" + """Returns the next marker. + + Returns: + tuple: The marker name as a string and its genotypes as a + :py:class:`numpy.ndarray`. + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") @@ -174,7 +193,7 @@ def next(self): if self._n > self._nb_markers: raise StopIteration() - return self._markers[self._n - 1], self._read_current_marker() + return self._bim.index[self._n - 1], self._read_current_marker() def _read_current_marker(self): """Reads the current marker and returns its genotypes.""" @@ -183,7 +202,12 @@ def _read_current_marker(self): ].flatten(order="C")[:self._nb_samples] def seek(self, n): - """Gets to a certain position in the BED file when iterating.""" + """Gets to a certain marker position in the BED file. + + Args: + n (int): The index of the marker to seek to. + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") @@ -196,30 +220,64 @@ def seek(self, n): raise ValueError("invalid position in BED: {}".format(n)) def _get_seek_position(self, n): - """Gets the seek position in the file (including special bytes).""" + """Gets the seek position in the file (including special bytes). + + Args: + n (int): The index of the marker to seek to. + + """ return 3 + self._nb_bytes * n def _read_bim(self): """Reads the BIM file.""" # Reading the BIM file and setting the values bim = pd.read_csv(self.bim_filename, delim_whitespace=True, - names=["chrom", "snp", "cm", "pos", "a1", "a2"]) - - # The 'snp', 'a1' and 'a2' columns should always be strings - bim["snp"] = bim["snp"].astype(str) - bim["a1"] = bim["a1"].astype(str) - bim["a2"] = bim["a2"].astype(str) - - bim = bim.set_index("snp") - bim["i"] = range(bim.shape[0]) - bim[2] = bim.a1 * 2 # Original '0' - bim[1] = bim.a1 + bim.a2 # Original '2' - bim[0] = bim.a2 * 2 # Original '3' - bim[-1] = "00" # Original 1 - - # Testing something + names=["chrom", "snp", "cm", "pos", "a1", "a2"], + dtype=dict(snp=str, a1=str, a2=str)) + + # Saving the index as integer + bim["i"] = bim.index + + # Checking for duplicated markers + try: + bim = bim.set_index("snp", verify_integrity=True) + self._has_duplicated = False + + except ValueError as e: + # Setting this flag to true + self._has_duplicated = True + + # Finding the duplicated markers + duplicated = bim.snp.duplicated(keep=False) + duplicated_markers = bim.loc[duplicated, "snp"] + duplicated_marker_counts = duplicated_markers.value_counts() + self._dup_markers = set(duplicated_marker_counts.index) + + # Logging a warning + logger.warning("Duplicated markers found") + for marker, count in duplicated_marker_counts.iteritems(): + logger.warning(" - {}: {:,d} times".format(marker, count)) + logger.warning("Appending ':dupX' to the duplicated markers " + "according to their location in the BIM file") + + # Renaming the markers + counter = Counter() + for i, marker in duplicated_markers.iteritems(): + counter[marker] += 1 + new_name = "{}:dup{}".format(marker, counter[marker]) + bim.loc[i, "snp"] = new_name + + # Resetting the index + bim = bim.set_index("snp", verify_integrity=True) + + # Encoding the allele + # - The original 0 is the actual 2 (a1/a1) + # - The original 2 is the actual 1 (a1/a2) + # - The original 3 is the actual 0 (a2/a2) + # - The original 1 is the actual -1 (no call) allele_encoding = np.array( - [bim[0], bim[1], bim[2], bim[-1]], + [bim.a2 * 2, bim.a1 + bim.a2, bim.a1 * 2, + list(repeat("00", bim.shape[0]))], dtype="U2", ) self._allele_encoding = allele_encoding.T @@ -227,35 +285,52 @@ def _read_bim(self): # Saving the data in the object self._bim = bim[["chrom", "pos", "cm", "a1", "a2", "i"]] self._nb_markers = self._bim.shape[0] - self._markers = self._bim.index.values def get_bim(self): - """Returns the BIM file.""" + """Returns the BIM file. + + Returns: + pandas.DataFrame: The BIM file. + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") return self._bim.drop("i", axis=1) def get_nb_markers(self): - """Returns the number of markers.""" + """Returns the number of markers. + + Returns: + int: The number of markers in the dataset. + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") return self._nb_markers + def get_duplicated_markers(self): + """Returns the duplicated markers, if any. + + Args: + set: The set of duplicated marker (might be empty). + + """ + if self._has_duplicated: + return self._dup_markers + else: + return set() + def _read_fam(self): """Reads the FAM file.""" # Reading the FAM file and setting the values fam = pd.read_csv(self.fam_filename, delim_whitespace=True, names=["fid", "iid", "father", "mother", "gender", - "status"]) - - # The sample IDs should always be strings (more logical that way) - fam["fid"] = fam["fid"].astype(str) - fam["iid"] = fam["iid"].astype(str) - fam["father"] = fam["father"].astype(str) - fam["mother"] = fam["mother"].astype(str) + "status"], + dtype=dict(fid=str, iid=str, father=str, mother=str)) + # Getting the byte and bit location of each samples fam["byte"] = [ int(np.ceil((1 + 1) / 4.0)) - 1 for i in range(len(fam)) ] @@ -266,14 +341,24 @@ def _read_fam(self): self._nb_samples = self._fam.shape[0] def get_fam(self): - """Returns the FAM file.""" + """Returns the FAM file. + + Returns: + pandas.DataFrame: The FAM file. + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") return self._fam.drop(["byte", "bit"], axis=1) def get_nb_samples(self): - """Returns the number of samples.""" + """Returns the number of samples. + + Returns: + int: The number of samples in the dataset. + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") @@ -301,9 +386,9 @@ def _read_bed(self): raise ValueError("not in SNP-major format (please recode): " "{}".format(self.bed_filename)) - # Checking the last entry - seek_position = self._get_seek_position(self._bim.iloc[-1, :].i) - bed_file.seek(seek_position) + # Checking the last entry (for BED corruption) + seek_index = self._get_seek_position(self._bim.iloc[-1, :].i) + bed_file.seek(seek_index) geno = self._geno_values[ np.fromstring(bed_file.read(self._nb_bytes), dtype=np.uint8) ].flatten(order="C")[:self._nb_samples] @@ -321,7 +406,13 @@ def _write_bed_header(self): self._bed.write(bytearray((108, 27, final_byte))) def iter_geno(self): - """Iterates over genotypes from the beginning.""" + """Iterates over genotypes from the beginning of the BED file. + + Returns: + tuple: The name of the marker as a string, and its genotypes as a + :py:class:`numpy.ndarray` (additive format). + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") @@ -332,13 +423,29 @@ def iter_geno(self): return self def iter_acgt_geno(self): - """Iterates over genotypes (ACGT format).""" + """Iterates over genotypes (ACGT format). + + Returns: + tuple: The name of the marker as a string, and its genotypes as a + :py:class:`numpy.ndarray` (ACGT format). + + """ # Need to iterate over itself, and modify the actual genotypes for i, (marker, geno) in enumerate(self.iter_geno()): yield marker, self._allele_encoding[i][geno] - def iter_geno_marker(self, markers, return_seek=False): - """Iterates over genotypes for given markers.""" + def iter_geno_marker(self, markers, return_index=False): + """Iterates over genotypes for a list of markers. + + Args: + markers (list): The list of markers to iterate onto. + return_index (bool): Wether to return the marker's index or not. + + Returns: + tuple: The name of the marker as a string, and its genotypes as a + :py:class:`numpy.ndarray` (additive format). + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") @@ -347,23 +454,41 @@ def iter_geno_marker(self, markers, return_seek=False): markers = [markers] # Iterating over all markers - if return_seek: + if return_index: for marker in markers: - geno, seek = self.get_geno_marker(marker, return_seek=True) + geno, seek = self.get_geno_marker(marker, return_index=True) yield marker, geno, seek else: for marker in markers: yield marker, self.get_geno_marker(marker) def iter_acgt_geno_marker(self, markers): - """Iterates over genotypes for given markers (ACGT format).""" + """Iterates over genotypes for a list of markers (ACGT format). + + Args: + markers (list): The list of markers to iterate onto. + + Returns: + tuple: The name of the marker as a string, and its genotypes as a + :py:class:`numpy.ndarray` (ACGT format). + + """ # We iterate over the markers - for snp, geno, s in self.iter_geno_marker(markers, return_seek=True): + for snp, geno, s in self.iter_geno_marker(markers, return_index=True): # Getting the SNP position and converting to ACGT yield snp, self._allele_encoding[s][geno] - def get_geno_marker(self, marker, return_seek=False): - """Gets the genotypes for a given marker.""" + def get_geno_marker(self, marker, return_index=False): + """Gets the genotypes for a given marker. + + Args: + marker (str): The name of the marker. + return_index (bool): Wether to return the marker's index or not. + + Returns: + numpy.ndarray: The genotypes of the marker (additive format). + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") @@ -372,28 +497,36 @@ def get_geno_marker(self, marker, return_seek=False): raise ValueError("{}: marker not in BIM".format(marker)) # Seeking to the correct position - seek_position = self._bim.loc[marker, "i"] - self.seek(seek_position) + seek_index = self._bim.loc[marker, "i"] + self.seek(seek_index) - if return_seek: - return self._read_current_marker(), seek_position + if return_index: + return self._read_current_marker(), seek_index return self._read_current_marker() def get_acgt_geno_marker(self, marker): - """Gets the genotypes for a given marker (ACGT format).""" + """Gets the genotypes for a given marker (ACGT format). + + Args: + marker (str): The name of the marker. + + Returns: + numpy.ndarray: The genotypes of the marker (ACGT format). + + """ # Getting the marker's genotypes - geno, snp_position = self.get_geno_marker(marker, return_seek=True) + geno, snp_position = self.get_geno_marker(marker, return_index=True) # Returning the ACGT's format of the genotypes return self._allele_encoding[snp_position][geno] - def write_marker(self, genotypes): - """Deprecated function.""" - warnings.warn("deprecated: use 'write_genotypes'", DeprecationWarning) - self.write_genotypes(genotypes) - def write_genotypes(self, genotypes): - """Write genotypes to binary file.""" + """Write genotypes to binary file. + + Args: + genotypes (numpy.ndarray): The genotypes to write in the BED file. + + """ if self._mode != "w": raise UnsupportedOperation("not available in 'r' mode") @@ -417,6 +550,15 @@ def write_genotypes(self, genotypes): @staticmethod def _grouper(iterable, n, fillvalue=0): - """Collect data into fixed-length chunks or blocks.""" + """Collect data into fixed-length chunks or blocks. + + Args: + n (int): The size of the chunk. + fillvalue (int): The fill value. + + Returns: + iterator: An iterator over the chunks. + + """ args = [iter(iterable)] * n return zip_longest(fillvalue=fillvalue, *args) diff --git a/pyplink/tests/__main__.py b/pyplink/tests/__main__.py new file mode 100644 index 0000000..96cdc50 --- /dev/null +++ b/pyplink/tests/__main__.py @@ -0,0 +1,36 @@ +# This file is part of pyplink. +# +# The MIT License (MIT) +# +# Copyright (c) 2014 Louis-Philippe Lemieux Perreault +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + + +import unittest + +from . import test_suite + + +__author__ = "Louis-Philippe Lemieux Perreault" +__copyright__ = "Copyright 2014 Louis-Philippe Lemieux Perreault" +__license__ = "MIT" + + +unittest.TextTestRunner(verbosity=1).run(test_suite) diff --git a/pyplink/tests/data/test_data.bim b/pyplink/tests/data/test_data.bim index 2bb9231..10df21c 100644 --- a/pyplink/tests/data/test_data.bim +++ b/pyplink/tests/data/test_data.bim @@ -1,10 +1,10 @@ 1 rs10399749 0 45162 G C -1 rs2949420 0 45257 C T -1 rs2949421 0 45413 0 0 -1 rs2691310 0 46844 A T -1 rs4030303 0 72434 0 G -1 rs4030300 0 72515 0 C -1 rs3855952 0 77689 G A -1 rs940550 0 78032 0 T -1 rs13328714 0 81468 G C -1 rs11490937 0 222077 A G +2 rs2949420 1 45257 C T +3 rs2949421 1 45413 0 0 +4 rs2691310 2 46844 A T +4 rs4030303 2 72434 0 G +5 rs4030300 3 72515 0 C +6 rs3855952 4 77689 G A +6 rs940550 4 78032 0 T +6 rs13328714 5 81468 G C +8 rs11490937 6 222077 A G diff --git a/pyplink/tests/data/test_data_with_dup.bim b/pyplink/tests/data/test_data_with_dup.bim new file mode 100644 index 0000000..a67858a --- /dev/null +++ b/pyplink/tests/data/test_data_with_dup.bim @@ -0,0 +1,10 @@ +1 rs10399749 0 45162 G C +2 rs2949420 1 45257 C T +3 rs2949421 1 45413 0 0 +4 rs2691310 2 46844 A T +4 rs4030303 2 72434 0 G +5 rs4030303 3 72515 0 C +6 rs4030303 4 77689 G A +6 rs940550 4 78032 0 T +6 rs940550 5 81468 G C +8 rs2949420 6 222077 A G diff --git a/pyplink/tests/test_pyplink.py b/pyplink/tests/test_pyplink.py index 254de22..afee1d0 100644 --- a/pyplink/tests/test_pyplink.py +++ b/pyplink/tests/test_pyplink.py @@ -33,7 +33,6 @@ import zipfile import platform import unittest -import warnings from tempfile import mkdtemp from io import UnsupportedOperation from distutils.spawn import find_executable @@ -49,6 +48,11 @@ except ImportError: from urllib import urlretrieve +try: + from unittest import mock +except ImportError: + import mock + from pkg_resources import resource_filename import numpy as np @@ -56,7 +60,7 @@ from six.moves import range -from ..pyplink import PyPlink +from .. import pyplink def get_plink(tmp_dir): @@ -171,7 +175,7 @@ def setUpClass(cls): def setUp(self): # Reading the plink binary file - self.pedfile = PyPlink(self.prefix) + self.pedfile = pyplink.PyPlink(self.prefix) @classmethod def tearDownClass(cls): @@ -244,7 +248,7 @@ def test_pyplink_bad_bed(self): # This should raise an exception with self.assertRaises(ValueError) as cm: - PyPlink(new_prefix) + pyplink.PyPlink(new_prefix) self.assertEqual("invalid number of entries: corrupted BED?", str(cm.exception)) @@ -255,7 +259,7 @@ def test_pyplink_bad_bed(self): # This should raise an exception with self.assertRaises(ValueError) as cm: - PyPlink(new_prefix) + pyplink.PyPlink(new_prefix) self.assertEqual("not a valid BED file: {}".format(new_bed), str(cm.exception)) @@ -266,7 +270,7 @@ def test_pyplink_bad_bed(self): # This should raise an exception with self.assertRaises(ValueError) as cm: - PyPlink(new_prefix) + pyplink.PyPlink(new_prefix) self.assertEqual("not a valid BED file: {}".format(new_bed), str(cm.exception)) @@ -277,7 +281,7 @@ def test_pyplink_bad_bed(self): # This should raise an exception with self.assertRaises(ValueError) as cm: - PyPlink(new_prefix) + pyplink.PyPlink(new_prefix) self.assertEqual( "not in SNP-major format (please recode): {}".format(new_bed), str(cm.exception), @@ -295,7 +299,7 @@ def test_missing_files(self): for extension in (".bed", ".bim", ".fam"): os.remove(prefix + extension) with self.assertRaises(IOError) as cm: - PyPlink(prefix) + pyplink.PyPlink(prefix) self.assertEqual("No such file: '{}'".format(prefix + extension), str(cm.exception)) with open(prefix + extension, "w"): @@ -309,7 +313,8 @@ def test_get_nb_markers_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: p.get_nb_markers() self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -321,7 +326,8 @@ def test_get_nb_samples_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: p.get_nb_samples() self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -331,10 +337,10 @@ def test_get_bim(self): ori_bim = self.pedfile._bim # The expected values - chromosomes = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + chromosomes = [1, 2, 3, 4, 4, 5, 6, 6, 6, 8] positions = [45162, 45257, 45413, 46844, 72434, 72515, 77689, 78032, 81468, 222077] - cms = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + cms = [0, 1, 1, 2, 2, 3, 4, 4, 5, 6] a1s = ["G", "C", "0", "A", "0", "0", "G", "0", "G", "A"] a2s = ["C", "T", "0", "T", "G", "C", "A", "T", "C", "G"] @@ -375,7 +381,8 @@ def test_get_bim_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: p.get_bim() self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -432,7 +439,8 @@ def test_get_fam_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: p.get_fam() self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -455,7 +463,8 @@ def test_generator_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: marker, genotypes = next(p) self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -471,7 +480,8 @@ def test_next_w_mode(self): """Tests that an exception is raised when calling next in w mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: p.next() self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -533,7 +543,8 @@ def test_seek_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: p.seek(100) self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -551,7 +562,8 @@ def test_iter_geno_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: marker, genotypes = next(p.iter_geno()) self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -569,7 +581,8 @@ def test_iter_acgt_geno_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: marker, genotypes = next(p.iter_acgt_geno()) self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -609,7 +622,8 @@ def test_iter_geno_marker_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: marker, genotypes = next(p.iter_geno_marker(["M1", "M2"])) self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -649,7 +663,8 @@ def test_iter_acgt_geno_marker_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: marker, genotypes = next(p.iter_acgt_geno_marker(["M1", "M2"])) self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -681,7 +696,8 @@ def test_repr_w_mode(self): e_repr = 'PyPlink(mode="w")' # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_repr"), "w") as pedfile: + prefix = os.path.join(self.tmp_dir, "test_repr") + with pyplink.PyPlink(prefix, "w") as pedfile: # Comparing the expected with the observed representation o_repr = str(pedfile) self.assertEqual(e_repr, o_repr) @@ -709,7 +725,8 @@ def test_get_geno_marker_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: p.get_geno_marker("M1") self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -717,7 +734,8 @@ def test_get_iter_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: iter(p) self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -741,19 +759,20 @@ def test_get_acgt_geno_marker_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: p.get_acgt_geno_marker("M1") self.assertEqual("not available in 'w' mode", str(cm.exception)) def test_get_context_read_mode(self): """Tests the PyPlink object as context manager.""" - with PyPlink(self.prefix) as genotypes: + with pyplink.PyPlink(self.prefix) as genotypes: self.assertEqual(3, len(genotypes.get_fam().head(n=3))) def test_invalid_mode(self): """Tests invalid mode when PyPlink as context manager.""" with self.assertRaises(ValueError) as cm: - PyPlink(self.prefix, "u") + pyplink.PyPlink(self.prefix, "u") self.assertEqual("invalid mode: 'u'", str(cm.exception)) def test_write_binary(self): @@ -769,7 +788,7 @@ def test_write_binary(self): test_prefix = os.path.join(self.tmp_dir, "test_write") # Writing the binary file - with PyPlink(test_prefix, "w") as pedfile: + with pyplink.PyPlink(test_prefix, "w") as pedfile: for genotypes in expected_genotypes: pedfile.write_genotypes(genotypes) @@ -789,7 +808,7 @@ def test_write_binary(self): file=o_file) # Reading the written binary file - with PyPlink(test_prefix) as pedfile: + with pyplink.PyPlink(test_prefix) as pedfile: for i, (marker, genotypes) in enumerate(pedfile): self.assertEqual("m{}".format(i+1), marker) np.testing.assert_array_equal(expected_genotypes[i], genotypes) @@ -808,7 +827,7 @@ def test_write_binary_error(self): # Writing the binary file with self.assertRaises(ValueError) as cm: - with PyPlink(test_prefix, "w") as pedfile: + with pyplink.PyPlink(test_prefix, "w") as pedfile: pedfile.write_genotypes(expected_genotypes[0]) # 7 genotypes pedfile.write_genotypes(expected_genotypes[1]) # 6 genotypes self.assertEqual("7 samples expected, got 6", str(cm.exception)) @@ -821,7 +840,7 @@ def test_grouper_padding(self): (6, 7, 8), (9, 0, 0), ] - observed_chunks = PyPlink._grouper(range(10), 3) + observed_chunks = pyplink.PyPlink._grouper(range(10), 3) for expected, observed in zip(expected_chunks, observed_chunks): self.assertEqual(expected, observed) @@ -831,7 +850,7 @@ def test_grouper_no_padding(self): (0, 1, 2, 3, 4), (5, 6, 7, 8, 9), ] - observed_chunks = PyPlink._grouper(range(10), 5) + observed_chunks = pyplink.PyPlink._grouper(range(10), 5) for expected, observed in zip(expected_chunks, observed_chunks): self.assertEqual(expected, observed) @@ -850,7 +869,7 @@ def test_with_plink(self): [0, 0, 0, 0, 0, 1, 0, 0, 0, 0], ] prefix = os.path.join(self.tmp_dir, "test_output") - with PyPlink(prefix, "w") as pedfile: + with pyplink.PyPlink(prefix, "w") as pedfile: for genotypes in all_genotypes: pedfile.write_genotypes(genotypes) @@ -950,7 +969,7 @@ def test_with_plink_individual_major(self): # Creating the BED file (INDIVIDUAL-major) prefix = os.path.join(self.tmp_dir, "test_output") - with PyPlink(prefix, "w", "INDIVIDUAL-major") as pedfile: + with pyplink.PyPlink(prefix, "w", "INDIVIDUAL-major") as pedfile: for genotypes in transposed_genotypes: pedfile.write_genotypes(genotypes) @@ -1032,7 +1051,7 @@ def test_with_plink_individual_major(self): def test_wrong_bed_format(self): """Tests opening a BED file with unknown format.""" with self.assertRaises(ValueError) as cm: - PyPlink(self.prefix, bed_format="UNKNOWN-major") + pyplink.PyPlink(self.prefix, bed_format="UNKNOWN-major") self.assertEqual( "invalid bed format: UNKNOWN-major", str(cm.exception), @@ -1041,7 +1060,7 @@ def test_wrong_bed_format(self): def test_invalid_bed_format_with_r_mode(self): """Tests an invalid BED format with r mode.""" with self.assertRaises(ValueError) as cm: - PyPlink(self.prefix, bed_format="INDIVIDUAL-major") + pyplink.PyPlink(self.prefix, bed_format="INDIVIDUAL-major") self.assertEqual( "only SNP-major format is supported with mode 'r'", str(cm.exception), @@ -1053,14 +1072,67 @@ def test_write_genotypes_in_r_mode(self): self.pedfile.write_genotypes([0, 0, 0]) self.assertEqual("not available in 'r' mode", str(cm.exception)) - def test_write_marker_deprecation_warning(self): - """Tests that a deprecation warning is triggered.""" - with warnings.catch_warnings(record=True) as w: - with PyPlink(os.path.join(self.tmp_dir, "test_warns"), "w") as p: - p.write_marker([0, 0, 0]) - self.assertEqual(1, len(w)) - self.assertTrue(issubclass(w[0].category, DeprecationWarning)) - self.assertEqual( - "deprecated: use 'write_genotypes'", - str(w[0].message), - ) + @mock.patch.object(pyplink, "logger") + def test_dup_markers(self, pyplink_logger): + """Tests when there are duplicated markers.""" + # Checking the original one has no duplicates + self.assertEqual(len(self.pedfile.get_duplicated_markers()), 0) + + # Copying the BED and the FAM to the temporary directory + new_prefix = os.path.join(self.tmp_dir, "with_dup") + for suffix in (".bed", ".fam"): + shutil.copyfile(self.prefix + suffix, new_prefix + suffix) + + # Copying the BIM file to the temporary directory + shutil.copyfile(self.prefix + "_with_dup.bim", new_prefix + ".bim") + + # Reading the new files + pedfile = pyplink.PyPlink(new_prefix) + + # Checking a warning was called + self.assertTrue(pyplink_logger.warning.called) + + # Checking the BIM + chromosomes = [1, 2, 3, 4, 4, 5, 6, 6, 6, 8] + markers = ["rs10399749", "rs2949420:dup1", "rs2949421", "rs2691310", + "rs4030303:dup1", "rs4030303:dup2", "rs4030303:dup3", + "rs940550:dup1", "rs940550:dup2", "rs2949420:dup2"] + positions = [45162, 45257, 45413, 46844, 72434, 72515, 77689, 78032, + 81468, 222077] + cms = [0, 1, 1, 2, 2, 3, 4, 4, 5, 6] + a1s = ["G", "C", "0", "A", "0", "0", "G", "0", "G", "A"] + a2s = ["C", "T", "0", "T", "G", "C", "A", "T", "C", "G"] + + # Getting the BIM file + bim = pedfile.get_bim() + + # Checking the columns + self.assertTrue( + set(bim.columns.values) == {"chrom", "pos", "cm", "a1", "a2"} + ) + + # Checking the indexes + self.assertTrue(set(bim.index.values) == set(markers)) + + # Checking the values for the markers + zipped = zip(markers, chromosomes, positions, cms, a1s, a2s) + for marker, chrom, pos, cm, a1, a2 in zipped: + self.assertEqual(chrom, bim.loc[marker, "chrom"]) + self.assertEqual(pos, bim.loc[marker, "pos"]) + self.assertEqual(cm, bim.loc[marker, "cm"]) + self.assertEqual(a1, bim.loc[marker, "a1"]) + self.assertEqual(a2, bim.loc[marker, "a2"]) + + # Checking only one duplicated markers + for i, marker in enumerate(markers): + geno = pedfile.get_geno_marker(marker) + np.testing.assert_array_equal(geno, self.genotypes[i]) + + # Checking the list of duplicated markers + self.assertEqual( + set(m.split(":")[0] for m in markers if ":" in m), + pedfile.get_duplicated_markers(), + ) + + # Closing the file + pedfile.close() diff --git a/setup.py b/setup.py index e27588e..1592560 100644 --- a/setup.py +++ b/setup.py @@ -11,12 +11,13 @@ import os +import sys from setuptools import setup MAJOR = 1 MINOR = 3 -MICRO = 1 +MICRO = 2 VERSION = "{}.{}.{}".format(MAJOR, MINOR, MICRO) @@ -39,6 +40,17 @@ def write_version_file(fn=None): a.close() +def get_requirements(): + # Initial requirements + requirements = ["numpy >= 1.8.2", "pandas >= 0.14.1", "six >= 1.9.0"] + + # Checking if python 2 (requires mock) + if sys.version_info[0] == 2: + requirements.append(["mock >= 2.0.0"]) + + return requirements + + def setup_package(): # Saving the version into a file write_version_file() @@ -55,8 +67,7 @@ def setup_package(): packages=["pyplink", "pyplink.tests"], package_data={"pyplink.tests": ["data/test_data.*"], }, test_suite="pyplink.tests.test_suite", - install_requires=["numpy >= 1.8.2", "pandas >= 0.14.1", - "six >= 1.9.0"], + install_requires=get_requirements(), classifiers=["Operating System :: POSIX :: Linux", "Operating System :: MacOS :: MacOS X", "Operating System :: Microsoft",