From 999503071ff31ff1c32dd91ab7e74633dcd27f4c Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Wed, 8 Mar 2017 14:10:22 -0500 Subject: [PATCH 01/18] Removed the deprecation warning --- pyplink/pyplink.py | 12 +----------- pyplink/tests/test_pyplink.py | 13 ------------- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index f09e5ad..0a90766 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -26,7 +26,6 @@ import os -import warnings from io import UnsupportedOperation try: @@ -48,10 +47,6 @@ __all__ = ["PyPlink"] -# Allowing for warnings -warnings.simplefilter("once", DeprecationWarning) - - # The recoding values _geno_recode = {1: -1, # Unknown genotype 2: 1, # Heterozygous genotype @@ -217,7 +212,7 @@ def _read_bim(self): bim[0] = bim.a2 * 2 # Original '3' bim[-1] = "00" # Original 1 - # Testing something + # Decoding the allele allele_encoding = np.array( [bim[0], bim[1], bim[2], bim[-1]], dtype="U2", @@ -387,11 +382,6 @@ def get_acgt_geno_marker(self, marker): # Returning the ACGT's format of the genotypes return self._allele_encoding[snp_position][geno] - def write_marker(self, genotypes): - """Deprecated function.""" - warnings.warn("deprecated: use 'write_genotypes'", DeprecationWarning) - self.write_genotypes(genotypes) - def write_genotypes(self, genotypes): """Write genotypes to binary file.""" if self._mode != "w": diff --git a/pyplink/tests/test_pyplink.py b/pyplink/tests/test_pyplink.py index 254de22..4e29f16 100644 --- a/pyplink/tests/test_pyplink.py +++ b/pyplink/tests/test_pyplink.py @@ -33,7 +33,6 @@ import zipfile import platform import unittest -import warnings from tempfile import mkdtemp from io import UnsupportedOperation from distutils.spawn import find_executable @@ -1052,15 +1051,3 @@ def test_write_genotypes_in_r_mode(self): with self.assertRaises(UnsupportedOperation) as cm: self.pedfile.write_genotypes([0, 0, 0]) self.assertEqual("not available in 'r' mode", str(cm.exception)) - - def test_write_marker_deprecation_warning(self): - """Tests that a deprecation warning is triggered.""" - with warnings.catch_warnings(record=True) as w: - with PyPlink(os.path.join(self.tmp_dir, "test_warns"), "w") as p: - p.write_marker([0, 0, 0]) - self.assertEqual(1, len(w)) - self.assertTrue(issubclass(w[0].category, DeprecationWarning)) - self.assertEqual( - "deprecated: use 'write_genotypes'", - str(w[0].message), - ) From 69a614d15d9bfa7ed6e3e69b2f71b63b9510fed4 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Thu, 9 Mar 2017 12:56:28 -0500 Subject: [PATCH 02/18] Better way to make sure that snp name, alleles and sample IDs are always string --- pyplink/pyplink.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index 0a90766..4d3154e 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -26,6 +26,7 @@ import os +import logging from io import UnsupportedOperation try: @@ -47,6 +48,10 @@ __all__ = ["PyPlink"] +# The logger +logger = logging.getLogger(__name__) + + # The recoding values _geno_recode = {1: -1, # Unknown genotype 2: 1, # Heterozygous genotype @@ -198,12 +203,8 @@ def _read_bim(self): """Reads the BIM file.""" # Reading the BIM file and setting the values bim = pd.read_csv(self.bim_filename, delim_whitespace=True, - names=["chrom", "snp", "cm", "pos", "a1", "a2"]) - - # The 'snp', 'a1' and 'a2' columns should always be strings - bim["snp"] = bim["snp"].astype(str) - bim["a1"] = bim["a1"].astype(str) - bim["a2"] = bim["a2"].astype(str) + names=["chrom", "snp", "cm", "pos", "a1", "a2"], + dtype=dict(snp=str, a1=str, a2=str)) bim = bim.set_index("snp") bim["i"] = range(bim.shape[0]) @@ -243,14 +244,10 @@ def _read_fam(self): # Reading the FAM file and setting the values fam = pd.read_csv(self.fam_filename, delim_whitespace=True, names=["fid", "iid", "father", "mother", "gender", - "status"]) - - # The sample IDs should always be strings (more logical that way) - fam["fid"] = fam["fid"].astype(str) - fam["iid"] = fam["iid"].astype(str) - fam["father"] = fam["father"].astype(str) - fam["mother"] = fam["mother"].astype(str) + "status"], + dtype=dict(fid=str, iid=str, father=str, mother=str)) + # Getting the byte and bit location of each samples fam["byte"] = [ int(np.ceil((1 + 1) / 4.0)) - 1 for i in range(len(fam)) ] From 41db621799c637943c5ce942cdd96e1a8046fbe4 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Thu, 9 Mar 2017 13:17:19 -0500 Subject: [PATCH 03/18] Different way to create the encode key to increase speed --- pyplink/pyplink.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index 4d3154e..dc2b6d3 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -27,6 +27,7 @@ import os import logging +from itertools import repeat from io import UnsupportedOperation try: @@ -206,16 +207,20 @@ def _read_bim(self): names=["chrom", "snp", "cm", "pos", "a1", "a2"], dtype=dict(snp=str, a1=str, a2=str)) + # Saving the index as integer + bim["i"] = bim.index + + # Checking for duplicated markers bim = bim.set_index("snp") - bim["i"] = range(bim.shape[0]) - bim[2] = bim.a1 * 2 # Original '0' - bim[1] = bim.a1 + bim.a2 # Original '2' - bim[0] = bim.a2 * 2 # Original '3' - bim[-1] = "00" # Original 1 - # Decoding the allele + # Encoding the allele + # - The original 0 is the actual 2 (a1/a1) + # - The original 2 is the actual 1 (a1/a2) + # - The original 3 is the actual 0 (a2/a2) + # - The original 1 is the actual -1 (no call) allele_encoding = np.array( - [bim[0], bim[1], bim[2], bim[-1]], + [bim.a2 * 2, bim.a1 + bim.a2, bim.a1 * 2, + list(repeat("00", bim.shape[0]))], dtype="U2", ) self._allele_encoding = allele_encoding.T From 74663308f1302d339d648fffe8301f2bd1fd9232 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Thu, 9 Mar 2017 13:19:57 -0500 Subject: [PATCH 04/18] No need for the list markers (already in the index) --- pyplink/pyplink.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index dc2b6d3..27088c8 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -175,7 +175,7 @@ def next(self): if self._n > self._nb_markers: raise StopIteration() - return self._markers[self._n - 1], self._read_current_marker() + return self._bim.index[self._n - 1], self._read_current_marker() def _read_current_marker(self): """Reads the current marker and returns its genotypes.""" @@ -228,7 +228,6 @@ def _read_bim(self): # Saving the data in the object self._bim = bim[["chrom", "pos", "cm", "a1", "a2", "i"]] self._nb_markers = self._bim.shape[0] - self._markers = self._bim.index.values def get_bim(self): """Returns the BIM file.""" From a01a44cb930db491d16a2dc938ad40f4c32db8a2 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Thu, 9 Mar 2017 15:01:23 -0500 Subject: [PATCH 05/18] Fixes #2: Can now work with duplicated markers --- pyplink/pyplink.py | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index 27088c8..8bda865 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -28,6 +28,7 @@ import os import logging from itertools import repeat +from collections import Counter from io import UnsupportedOperation try: @@ -211,7 +212,36 @@ def _read_bim(self): bim["i"] = bim.index # Checking for duplicated markers - bim = bim.set_index("snp") + try: + bim = bim.set_index("snp", verify_integrity=True) + self._has_duplicated = False + + except ValueError as e: + # Setting this flag to true + self._has_duplicated = True + + # Finding the duplicated markers + duplicated = bim.snp.duplicated(keep=False) + duplicated_markers = bim.loc[duplicated, "snp"] + duplicated_marker_counts = duplicated_markers.value_counts() + self._dup_markers = set(duplicated_marker_counts.index) + + # Logging a warning + logger.warning("Duplicated markers found") + for marker, count in duplicated_marker_counts.iteritems(): + logger.warning(" - {}: {:,d} times".format(marker, count)) + logger.warning("Appending ':dupX' to the duplicated markers " + "according to their location in the BIM file") + + # Renaming the markers + counter = Counter() + for i, marker in duplicated_markers.iteritems(): + counter[marker] += 1 + new_name = "{}:dup{}".format(marker, counter[marker]) + bim.loc[i, "snp"] = new_name + + # Resetting the index + bim = bim.set_index("snp", verify_integrity=True) # Encoding the allele # - The original 0 is the actual 2 (a1/a1) @@ -243,6 +273,13 @@ def get_nb_markers(self): return self._nb_markers + def get_duplicated_markers(self): + """Returns the duplicated markers, if any.""" + if self._has_duplicated: + return self._dup_markers + else: + return set() + def _read_fam(self): """Reads the FAM file.""" # Reading the FAM file and setting the values @@ -297,7 +334,7 @@ def _read_bed(self): raise ValueError("not in SNP-major format (please recode): " "{}".format(self.bed_filename)) - # Checking the last entry + # Checking the last entry (for BED corruption) seek_position = self._get_seek_position(self._bim.iloc[-1, :].i) bed_file.seek(seek_position) geno = self._geno_values[ From 820e1bd1c7e13e84c0967b31b09141aca177437a Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Thu, 9 Mar 2017 16:10:38 -0500 Subject: [PATCH 06/18] Added a test for the duplicated markers --- pyplink/tests/data/test_data_with_dup.bim | 10 ++ pyplink/tests/test_pyplink.py | 152 +++++++++++++++++----- 2 files changed, 129 insertions(+), 33 deletions(-) create mode 100644 pyplink/tests/data/test_data_with_dup.bim diff --git a/pyplink/tests/data/test_data_with_dup.bim b/pyplink/tests/data/test_data_with_dup.bim new file mode 100644 index 0000000..df83cbe --- /dev/null +++ b/pyplink/tests/data/test_data_with_dup.bim @@ -0,0 +1,10 @@ +1 rs10399749 0 45162 G C +1 rs2949420 0 45257 C T +1 rs2949421 0 45413 0 0 +1 rs2691310 0 46844 A T +1 rs4030303 0 72434 0 G +1 rs4030303 0 72515 0 C +1 rs4030303 0 77689 G A +1 rs940550 0 78032 0 T +1 rs940550 0 81468 G C +1 rs2949420 0 222077 A G diff --git a/pyplink/tests/test_pyplink.py b/pyplink/tests/test_pyplink.py index 4e29f16..4af424a 100644 --- a/pyplink/tests/test_pyplink.py +++ b/pyplink/tests/test_pyplink.py @@ -48,6 +48,11 @@ except ImportError: from urllib import urlretrieve +try: + from unittest import mock +except ImportError: + import mock + from pkg_resources import resource_filename import numpy as np @@ -55,7 +60,7 @@ from six.moves import range -from ..pyplink import PyPlink +from .. import pyplink def get_plink(tmp_dir): @@ -170,7 +175,7 @@ def setUpClass(cls): def setUp(self): # Reading the plink binary file - self.pedfile = PyPlink(self.prefix) + self.pedfile = pyplink.PyPlink(self.prefix) @classmethod def tearDownClass(cls): @@ -243,7 +248,7 @@ def test_pyplink_bad_bed(self): # This should raise an exception with self.assertRaises(ValueError) as cm: - PyPlink(new_prefix) + pyplink.PyPlink(new_prefix) self.assertEqual("invalid number of entries: corrupted BED?", str(cm.exception)) @@ -254,7 +259,7 @@ def test_pyplink_bad_bed(self): # This should raise an exception with self.assertRaises(ValueError) as cm: - PyPlink(new_prefix) + pyplink.PyPlink(new_prefix) self.assertEqual("not a valid BED file: {}".format(new_bed), str(cm.exception)) @@ -265,7 +270,7 @@ def test_pyplink_bad_bed(self): # This should raise an exception with self.assertRaises(ValueError) as cm: - PyPlink(new_prefix) + pyplink.PyPlink(new_prefix) self.assertEqual("not a valid BED file: {}".format(new_bed), str(cm.exception)) @@ -276,7 +281,7 @@ def test_pyplink_bad_bed(self): # This should raise an exception with self.assertRaises(ValueError) as cm: - PyPlink(new_prefix) + pyplink.PyPlink(new_prefix) self.assertEqual( "not in SNP-major format (please recode): {}".format(new_bed), str(cm.exception), @@ -294,7 +299,7 @@ def test_missing_files(self): for extension in (".bed", ".bim", ".fam"): os.remove(prefix + extension) with self.assertRaises(IOError) as cm: - PyPlink(prefix) + pyplink.PyPlink(prefix) self.assertEqual("No such file: '{}'".format(prefix + extension), str(cm.exception)) with open(prefix + extension, "w"): @@ -308,7 +313,8 @@ def test_get_nb_markers_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: p.get_nb_markers() self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -320,10 +326,12 @@ def test_get_nb_samples_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: p.get_nb_samples() self.assertEqual("not available in 'w' mode", str(cm.exception)) + # TODO: Change some chromosomes and cm just to be sure def test_get_bim(self): """Tests the 'get_bim' function.""" # The original BIM file (with the 'i' column) @@ -374,7 +382,8 @@ def test_get_bim_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: p.get_bim() self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -431,7 +440,8 @@ def test_get_fam_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: p.get_fam() self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -454,7 +464,8 @@ def test_generator_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: marker, genotypes = next(p) self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -470,7 +481,8 @@ def test_next_w_mode(self): """Tests that an exception is raised when calling next in w mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: p.next() self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -532,7 +544,8 @@ def test_seek_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: p.seek(100) self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -550,7 +563,8 @@ def test_iter_geno_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: marker, genotypes = next(p.iter_geno()) self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -568,7 +582,8 @@ def test_iter_acgt_geno_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: marker, genotypes = next(p.iter_acgt_geno()) self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -608,7 +623,8 @@ def test_iter_geno_marker_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: marker, genotypes = next(p.iter_geno_marker(["M1", "M2"])) self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -648,7 +664,8 @@ def test_iter_acgt_geno_marker_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: marker, genotypes = next(p.iter_acgt_geno_marker(["M1", "M2"])) self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -680,7 +697,8 @@ def test_repr_w_mode(self): e_repr = 'PyPlink(mode="w")' # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_repr"), "w") as pedfile: + prefix = os.path.join(self.tmp_dir, "test_repr") + with pyplink.PyPlink(prefix, "w") as pedfile: # Comparing the expected with the observed representation o_repr = str(pedfile) self.assertEqual(e_repr, o_repr) @@ -708,7 +726,8 @@ def test_get_geno_marker_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: p.get_geno_marker("M1") self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -716,7 +735,8 @@ def test_get_iter_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: iter(p) self.assertEqual("not available in 'w' mode", str(cm.exception)) @@ -740,19 +760,20 @@ def test_get_acgt_geno_marker_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object - with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: + prefix = os.path.join(self.tmp_dir, "test_error") + with pyplink.PyPlink(prefix, "w") as p: p.get_acgt_geno_marker("M1") self.assertEqual("not available in 'w' mode", str(cm.exception)) def test_get_context_read_mode(self): """Tests the PyPlink object as context manager.""" - with PyPlink(self.prefix) as genotypes: + with pyplink.PyPlink(self.prefix) as genotypes: self.assertEqual(3, len(genotypes.get_fam().head(n=3))) def test_invalid_mode(self): """Tests invalid mode when PyPlink as context manager.""" with self.assertRaises(ValueError) as cm: - PyPlink(self.prefix, "u") + pyplink.PyPlink(self.prefix, "u") self.assertEqual("invalid mode: 'u'", str(cm.exception)) def test_write_binary(self): @@ -768,7 +789,7 @@ def test_write_binary(self): test_prefix = os.path.join(self.tmp_dir, "test_write") # Writing the binary file - with PyPlink(test_prefix, "w") as pedfile: + with pyplink.PyPlink(test_prefix, "w") as pedfile: for genotypes in expected_genotypes: pedfile.write_genotypes(genotypes) @@ -788,7 +809,7 @@ def test_write_binary(self): file=o_file) # Reading the written binary file - with PyPlink(test_prefix) as pedfile: + with pyplink.PyPlink(test_prefix) as pedfile: for i, (marker, genotypes) in enumerate(pedfile): self.assertEqual("m{}".format(i+1), marker) np.testing.assert_array_equal(expected_genotypes[i], genotypes) @@ -807,7 +828,7 @@ def test_write_binary_error(self): # Writing the binary file with self.assertRaises(ValueError) as cm: - with PyPlink(test_prefix, "w") as pedfile: + with pyplink.PyPlink(test_prefix, "w") as pedfile: pedfile.write_genotypes(expected_genotypes[0]) # 7 genotypes pedfile.write_genotypes(expected_genotypes[1]) # 6 genotypes self.assertEqual("7 samples expected, got 6", str(cm.exception)) @@ -820,7 +841,7 @@ def test_grouper_padding(self): (6, 7, 8), (9, 0, 0), ] - observed_chunks = PyPlink._grouper(range(10), 3) + observed_chunks = pyplink.PyPlink._grouper(range(10), 3) for expected, observed in zip(expected_chunks, observed_chunks): self.assertEqual(expected, observed) @@ -830,7 +851,7 @@ def test_grouper_no_padding(self): (0, 1, 2, 3, 4), (5, 6, 7, 8, 9), ] - observed_chunks = PyPlink._grouper(range(10), 5) + observed_chunks = pyplink.PyPlink._grouper(range(10), 5) for expected, observed in zip(expected_chunks, observed_chunks): self.assertEqual(expected, observed) @@ -849,7 +870,7 @@ def test_with_plink(self): [0, 0, 0, 0, 0, 1, 0, 0, 0, 0], ] prefix = os.path.join(self.tmp_dir, "test_output") - with PyPlink(prefix, "w") as pedfile: + with pyplink.PyPlink(prefix, "w") as pedfile: for genotypes in all_genotypes: pedfile.write_genotypes(genotypes) @@ -949,7 +970,7 @@ def test_with_plink_individual_major(self): # Creating the BED file (INDIVIDUAL-major) prefix = os.path.join(self.tmp_dir, "test_output") - with PyPlink(prefix, "w", "INDIVIDUAL-major") as pedfile: + with pyplink.PyPlink(prefix, "w", "INDIVIDUAL-major") as pedfile: for genotypes in transposed_genotypes: pedfile.write_genotypes(genotypes) @@ -1031,7 +1052,7 @@ def test_with_plink_individual_major(self): def test_wrong_bed_format(self): """Tests opening a BED file with unknown format.""" with self.assertRaises(ValueError) as cm: - PyPlink(self.prefix, bed_format="UNKNOWN-major") + pyplink.PyPlink(self.prefix, bed_format="UNKNOWN-major") self.assertEqual( "invalid bed format: UNKNOWN-major", str(cm.exception), @@ -1040,7 +1061,7 @@ def test_wrong_bed_format(self): def test_invalid_bed_format_with_r_mode(self): """Tests an invalid BED format with r mode.""" with self.assertRaises(ValueError) as cm: - PyPlink(self.prefix, bed_format="INDIVIDUAL-major") + pyplink.PyPlink(self.prefix, bed_format="INDIVIDUAL-major") self.assertEqual( "only SNP-major format is supported with mode 'r'", str(cm.exception), @@ -1051,3 +1072,68 @@ def test_write_genotypes_in_r_mode(self): with self.assertRaises(UnsupportedOperation) as cm: self.pedfile.write_genotypes([0, 0, 0]) self.assertEqual("not available in 'r' mode", str(cm.exception)) + + @mock.patch.object(pyplink, "logger") + def test_dup_markers(self, pyplink_logger): + """Tests when there are duplicated markers.""" + # Checking the original one has no duplicates + self.assertEqual(len(self.pedfile.get_duplicated_markers()), 0) + + # Copying the BED and the FAM to the temporary directory + new_prefix = os.path.join(self.tmp_dir, "with_dup") + for suffix in (".bed", ".fam"): + shutil.copyfile(self.prefix + suffix, new_prefix + suffix) + + # Copying the BIM file to the temporary directory + shutil.copyfile(self.prefix + "_with_dup.bim", new_prefix + ".bim") + + # Reading the new files + pedfile = pyplink.PyPlink(new_prefix) + + # Checking a warning was called + self.assertTrue(pyplink_logger.warning.called) + + # Checking the BIM + chromosomes = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + markers = ["rs10399749", "rs2949420:dup1", "rs2949421", "rs2691310", + "rs4030303:dup1", "rs4030303:dup2", "rs4030303:dup3", + "rs940550:dup1", "rs940550:dup2", "rs2949420:dup2"] + positions = [45162, 45257, 45413, 46844, 72434, 72515, 77689, 78032, + 81468, 222077] + cms = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + a1s = ["G", "C", "0", "A", "0", "0", "G", "0", "G", "A"] + a2s = ["C", "T", "0", "T", "G", "C", "A", "T", "C", "G"] + + # Getting the BIM file + bim = pedfile.get_bim() + + # Checking the columns + self.assertTrue( + set(bim.columns.values) == {"chrom", "pos", "cm", "a1", "a2"} + ) + + # Checking the indexes + self.assertTrue(set(bim.index.values) == set(markers)) + + # Checking the values for the markers + zipped = zip(markers, chromosomes, positions, cms, a1s, a2s) + for marker, chrom, pos, cm, a1, a2 in zipped: + self.assertEqual(chrom, bim.loc[marker, "chrom"]) + self.assertEqual(pos, bim.loc[marker, "pos"]) + self.assertEqual(cm, bim.loc[marker, "cm"]) + self.assertEqual(a1, bim.loc[marker, "a1"]) + self.assertEqual(a2, bim.loc[marker, "a2"]) + + # Checking only one duplicated markers + for i, marker in enumerate(markers): + geno = pedfile.get_geno_marker(marker) + np.testing.assert_array_equal(geno, self.genotypes[i]) + + # Checking the list of duplicated markers + self.assertEqual( + set(m.split(":")[0] for m in markers if ":" in m), + pedfile.get_duplicated_markers(), + ) + + # Closing the file + pedfile.close() From 44d85e0828ad04404af466f58cc49cbba9b0ea26 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Thu, 9 Mar 2017 16:10:55 -0500 Subject: [PATCH 07/18] Added the dependencies for mock (pyhon 2) --- setup.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index e27588e..4589db7 100644 --- a/setup.py +++ b/setup.py @@ -11,6 +11,7 @@ import os +import sys from setuptools import setup @@ -39,6 +40,17 @@ def write_version_file(fn=None): a.close() +def get_requirements(): + # Initial requirements + requirements = ["numpy >= 1.8.2", "pandas >= 0.14.1", "six >= 1.9.0"] + + # Checking if python 2 (requires mock) + if sys.version_info[0] == 2: + requirements.append(["mock >= 2.0.0"]) + + return requirements + + def setup_package(): # Saving the version into a file write_version_file() @@ -55,8 +67,7 @@ def setup_package(): packages=["pyplink", "pyplink.tests"], package_data={"pyplink.tests": ["data/test_data.*"], }, test_suite="pyplink.tests.test_suite", - install_requires=["numpy >= 1.8.2", "pandas >= 0.14.1", - "six >= 1.9.0"], + install_requires=get_requirements(), classifiers=["Operating System :: POSIX :: Linux", "Operating System :: MacOS :: MacOS X", "Operating System :: Microsoft", From 036b23f4cf177f1cf5d2253f5a645848d9f0735d Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Thu, 9 Mar 2017 16:21:27 -0500 Subject: [PATCH 08/18] Changed the chromosome and cm of the test datasets --- pyplink/tests/data/test_data.bim | 18 +++++++++--------- pyplink/tests/data/test_data_with_dup.bim | 18 +++++++++--------- pyplink/tests/test_pyplink.py | 9 ++++----- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/pyplink/tests/data/test_data.bim b/pyplink/tests/data/test_data.bim index 2bb9231..10df21c 100644 --- a/pyplink/tests/data/test_data.bim +++ b/pyplink/tests/data/test_data.bim @@ -1,10 +1,10 @@ 1 rs10399749 0 45162 G C -1 rs2949420 0 45257 C T -1 rs2949421 0 45413 0 0 -1 rs2691310 0 46844 A T -1 rs4030303 0 72434 0 G -1 rs4030300 0 72515 0 C -1 rs3855952 0 77689 G A -1 rs940550 0 78032 0 T -1 rs13328714 0 81468 G C -1 rs11490937 0 222077 A G +2 rs2949420 1 45257 C T +3 rs2949421 1 45413 0 0 +4 rs2691310 2 46844 A T +4 rs4030303 2 72434 0 G +5 rs4030300 3 72515 0 C +6 rs3855952 4 77689 G A +6 rs940550 4 78032 0 T +6 rs13328714 5 81468 G C +8 rs11490937 6 222077 A G diff --git a/pyplink/tests/data/test_data_with_dup.bim b/pyplink/tests/data/test_data_with_dup.bim index df83cbe..a67858a 100644 --- a/pyplink/tests/data/test_data_with_dup.bim +++ b/pyplink/tests/data/test_data_with_dup.bim @@ -1,10 +1,10 @@ 1 rs10399749 0 45162 G C -1 rs2949420 0 45257 C T -1 rs2949421 0 45413 0 0 -1 rs2691310 0 46844 A T -1 rs4030303 0 72434 0 G -1 rs4030303 0 72515 0 C -1 rs4030303 0 77689 G A -1 rs940550 0 78032 0 T -1 rs940550 0 81468 G C -1 rs2949420 0 222077 A G +2 rs2949420 1 45257 C T +3 rs2949421 1 45413 0 0 +4 rs2691310 2 46844 A T +4 rs4030303 2 72434 0 G +5 rs4030303 3 72515 0 C +6 rs4030303 4 77689 G A +6 rs940550 4 78032 0 T +6 rs940550 5 81468 G C +8 rs2949420 6 222077 A G diff --git a/pyplink/tests/test_pyplink.py b/pyplink/tests/test_pyplink.py index 4af424a..afee1d0 100644 --- a/pyplink/tests/test_pyplink.py +++ b/pyplink/tests/test_pyplink.py @@ -331,17 +331,16 @@ def test_get_nb_samples_w_mode(self): p.get_nb_samples() self.assertEqual("not available in 'w' mode", str(cm.exception)) - # TODO: Change some chromosomes and cm just to be sure def test_get_bim(self): """Tests the 'get_bim' function.""" # The original BIM file (with the 'i' column) ori_bim = self.pedfile._bim # The expected values - chromosomes = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + chromosomes = [1, 2, 3, 4, 4, 5, 6, 6, 6, 8] positions = [45162, 45257, 45413, 46844, 72434, 72515, 77689, 78032, 81468, 222077] - cms = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + cms = [0, 1, 1, 2, 2, 3, 4, 4, 5, 6] a1s = ["G", "C", "0", "A", "0", "0", "G", "0", "G", "A"] a2s = ["C", "T", "0", "T", "G", "C", "A", "T", "C", "G"] @@ -1094,13 +1093,13 @@ def test_dup_markers(self, pyplink_logger): self.assertTrue(pyplink_logger.warning.called) # Checking the BIM - chromosomes = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + chromosomes = [1, 2, 3, 4, 4, 5, 6, 6, 6, 8] markers = ["rs10399749", "rs2949420:dup1", "rs2949421", "rs2691310", "rs4030303:dup1", "rs4030303:dup2", "rs4030303:dup3", "rs940550:dup1", "rs940550:dup2", "rs2949420:dup2"] positions = [45162, 45257, 45413, 46844, 72434, 72515, 77689, 78032, 81468, 222077] - cms = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + cms = [0, 1, 1, 2, 2, 3, 4, 4, 5, 6] a1s = ["G", "C", "0", "A", "0", "0", "G", "0", "G", "A"] a2s = ["C", "T", "0", "T", "G", "C", "A", "T", "C", "G"] From 43d24d62ad0e2c902aaefb8d118870f725b8a4ae Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Thu, 9 Mar 2017 16:26:22 -0500 Subject: [PATCH 09/18] Added mock at installation --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 33cf18c..d90e449 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,7 @@ install: - pip install six - pip install numpy - pip install pandas + - pip install mock - pip install coveralls script: - coverage run setup.py test From 67f63f34d5a5ef1e83d30294a7801b1dd57d0c10 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Thu, 9 Mar 2017 16:32:37 -0500 Subject: [PATCH 10/18] Travis... --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d90e449..7668ff0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ install: - pip install six - pip install numpy - pip install pandas - - pip install mock + - pip install -U mock - pip install coveralls script: - coverage run setup.py test From d1c177778471be79b2cc4632f97a9706362ef5e6 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 10 Mar 2017 11:09:19 -0500 Subject: [PATCH 11/18] Updated README --- README.mkd | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.mkd b/README.mkd index d9d0e65..4e194f4 100644 --- a/README.mkd +++ b/README.mkd @@ -67,12 +67,11 @@ conda update pyplink -c http://statgen.org/wp-content/uploads/Softwares/pyplink To test the module, just perform the following command: -```python ->>> import pyplink ->>> pyplink.test() +```console +$ python -m pyplink.tests ............................................. ---------------------------------------------------------------------- -Ran 45 tests in 0.480s +Ran 45 tests in 0.334s OK ``` From 3cc9f6b0fa077ece50e189a94cc0ed120e0bc81f Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 10 Mar 2017 11:09:39 -0500 Subject: [PATCH 12/18] Better docstrings --- pyplink/__init__.py | 4 +- pyplink/pyplink.py | 168 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 137 insertions(+), 35 deletions(-) diff --git a/pyplink/__init__.py b/pyplink/__init__.py index 39c8ed0..dc8f1b0 100644 --- a/pyplink/__init__.py +++ b/pyplink/__init__.py @@ -25,7 +25,7 @@ # THE SOFTWARE. -from .pyplink import * +from .pyplink import PyPlink try: from .version import pyplink_version as __version__ @@ -46,7 +46,7 @@ def test(verbosity=1): # pragma: no cover """Executes all the tests for pyplink. Args: - verbosity (int): the verbosity level for unittest + verbosity (int): The verbosity level for :py:mod:`unittest`. Just set ``verbosity`` to an integer higher than ``1`` to have more information about the tests. diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index 8bda865..6c23c9b 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -63,7 +63,16 @@ class PyPlink(object): - """Reads and store a set of binary Plink files.""" + """Reads and store a set of binary Plink files. + + Args: + prefix (str): The prefix of the binary Plink files. + mode (str): The open mode for the binary Plink file. + bed_format (str): The type of bed (SNP-major or INDIVIDUAL-major). + + Reads or write binary Plink files (BED, BIM and FAM). + + """ # The genotypes values _geno_values = np.array( @@ -74,18 +83,8 @@ class PyPlink(object): dtype=np.int8, ) - def __init__(self, i_prefix, mode="r", bed_format="SNP-major"): - """Initializes a new PyPlink object. - - Args: - i_prefix (str): the prefix of the binary Plink files - mode (str): the open mode for the binary Plink file - nb_samples (int): the number of samples - bed_format (str): the type of bed (SNP-major or INDIVIDUAL-major) - - Reads or write binary Plink files (BED, BIM and FAM). - - """ + def __init__(self, prefix, mode="r", bed_format="SNP-major"): + """Initializes a new PyPlink instance.""" # The mode self._mode = mode @@ -95,9 +94,9 @@ def __init__(self, i_prefix, mode="r", bed_format="SNP-major"): self._bed_format = bed_format # These are the name of the files - self.bed_filename = "{}.bed".format(i_prefix) - self.bim_filename = "{}.bim".format(i_prefix) - self.fam_filename = "{}.fam".format(i_prefix) + self.bed_filename = "{}.bed".format(prefix) + self.bim_filename = "{}.bim".format(prefix) + self.fam_filename = "{}.fam".format(prefix) if self._mode == "r": if self._bed_format != "SNP-major": @@ -163,12 +162,18 @@ def __exit__(self, *args): self.close() def close(self): - """Closes the BED file if required.""" + """Closes the BED file.""" # Closing the BED file self._bed.close() def next(self): - """The next function.""" + """Returns the next marker. + + Returns: + tuple: The marker name as a string and its genotypes as a + :py:class:`numpy.ndarray`. + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") @@ -185,7 +190,12 @@ def _read_current_marker(self): ].flatten(order="C")[:self._nb_samples] def seek(self, n): - """Gets to a certain position in the BED file when iterating.""" + """Gets to a certain marker position in the BED file. + + Args: + n (int): The index of the marker to seek to. + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") @@ -198,7 +208,12 @@ def seek(self, n): raise ValueError("invalid position in BED: {}".format(n)) def _get_seek_position(self, n): - """Gets the seek position in the file (including special bytes).""" + """Gets the seek position in the file (including special bytes). + + Args: + n (int): The index of the marker to seek to. + + """ return 3 + self._nb_bytes * n def _read_bim(self): @@ -260,21 +275,36 @@ def _read_bim(self): self._nb_markers = self._bim.shape[0] def get_bim(self): - """Returns the BIM file.""" + """Returns the BIM file. + + Returns: + pandas.DataFrame: The BIM file. + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") return self._bim.drop("i", axis=1) def get_nb_markers(self): - """Returns the number of markers.""" + """Returns the number of markers. + + Returns: + int: The number of markers in the dataset. + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") return self._nb_markers def get_duplicated_markers(self): - """Returns the duplicated markers, if any.""" + """Returns the duplicated markers, if any. + + Args: + set: The set of duplicated marker (might be empty). + + """ if self._has_duplicated: return self._dup_markers else: @@ -299,14 +329,24 @@ def _read_fam(self): self._nb_samples = self._fam.shape[0] def get_fam(self): - """Returns the FAM file.""" + """Returns the FAM file. + + Returns: + pandas.DataFrame: The FAM file. + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") return self._fam.drop(["byte", "bit"], axis=1) def get_nb_samples(self): - """Returns the number of samples.""" + """Returns the number of samples. + + Returns: + int: The number of samples in the dataset. + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") @@ -354,7 +394,13 @@ def _write_bed_header(self): self._bed.write(bytearray((108, 27, final_byte))) def iter_geno(self): - """Iterates over genotypes from the beginning.""" + """Iterates over genotypes from the beginning of the BED file. + + Returns: + tuple: The name of the marker as a string, and its genotypes as a + :py:class:`numpy.ndarray` (additive format). + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") @@ -365,13 +411,29 @@ def iter_geno(self): return self def iter_acgt_geno(self): - """Iterates over genotypes (ACGT format).""" + """Iterates over genotypes (ACGT format). + + Returns: + tuple: The name of the marker as a string, and its genotypes as a + :py:class:`numpy.ndarray` (ACGT format). + + """ # Need to iterate over itself, and modify the actual genotypes for i, (marker, geno) in enumerate(self.iter_geno()): yield marker, self._allele_encoding[i][geno] def iter_geno_marker(self, markers, return_seek=False): - """Iterates over genotypes for given markers.""" + """Iterates over genotypes for a list of markers. + + Args: + markers (list): The list of markers to iterate onto. + return_seek (bool): Wether to return the marker's index or not. + + Returns: + tuple: The name of the marker as a string, and its genotypes as a + :py:class:`numpy.ndarray` (additive format). + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") @@ -389,14 +451,32 @@ def iter_geno_marker(self, markers, return_seek=False): yield marker, self.get_geno_marker(marker) def iter_acgt_geno_marker(self, markers): - """Iterates over genotypes for given markers (ACGT format).""" + """Iterates over genotypes for a list of markers (ACGT format). + + Args: + markers (list): The list of markers to iterate onto. + + Returns: + tuple: The name of the marker as a string, and its genotypes as a + :py:class:`numpy.ndarray` (ACGT format). + + """ # We iterate over the markers for snp, geno, s in self.iter_geno_marker(markers, return_seek=True): # Getting the SNP position and converting to ACGT yield snp, self._allele_encoding[s][geno] def get_geno_marker(self, marker, return_seek=False): - """Gets the genotypes for a given marker.""" + """Gets the genotypes for a given marker. + + Args: + marker (str): The name of the marker. + return_seek (bool): Wether to return the marker's index or not. + + Returns: + numpy.ndarray: The genotypes of the marker (additive format). + + """ if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") @@ -413,7 +493,15 @@ def get_geno_marker(self, marker, return_seek=False): return self._read_current_marker() def get_acgt_geno_marker(self, marker): - """Gets the genotypes for a given marker (ACGT format).""" + """Gets the genotypes for a given marker (ACGT format). + + Args: + marker (str): The name of the marker. + + Returns: + numpy.ndarray: The genotypes of the marker (ACGT format). + + """ # Getting the marker's genotypes geno, snp_position = self.get_geno_marker(marker, return_seek=True) @@ -421,7 +509,12 @@ def get_acgt_geno_marker(self, marker): return self._allele_encoding[snp_position][geno] def write_genotypes(self, genotypes): - """Write genotypes to binary file.""" + """Write genotypes to binary file. + + Args: + genotypes (numpy.ndarray): The genotypes to write in the BED file. + + """ if self._mode != "w": raise UnsupportedOperation("not available in 'r' mode") @@ -445,6 +538,15 @@ def write_genotypes(self, genotypes): @staticmethod def _grouper(iterable, n, fillvalue=0): - """Collect data into fixed-length chunks or blocks.""" + """Collect data into fixed-length chunks or blocks. + + Args: + n (int): The size of the chunk. + fillvalue (int): The fill value. + + Returns: + iterator: An iterator over the chunks. + + """ args = [iter(iterable)] * n return zip_longest(fillvalue=fillvalue, *args) From b0e81d445f25a90194a9d601ccbed4dc8da664ab Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 10 Mar 2017 11:10:04 -0500 Subject: [PATCH 13/18] Now possible to execute tests using 'python -m pyplink.tests'. --- pyplink/tests/__main__.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 pyplink/tests/__main__.py diff --git a/pyplink/tests/__main__.py b/pyplink/tests/__main__.py new file mode 100644 index 0000000..96cdc50 --- /dev/null +++ b/pyplink/tests/__main__.py @@ -0,0 +1,36 @@ +# This file is part of pyplink. +# +# The MIT License (MIT) +# +# Copyright (c) 2014 Louis-Philippe Lemieux Perreault +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + + +import unittest + +from . import test_suite + + +__author__ = "Louis-Philippe Lemieux Perreault" +__copyright__ = "Copyright 2014 Louis-Philippe Lemieux Perreault" +__license__ = "MIT" + + +unittest.TextTestRunner(verbosity=1).run(test_suite) From c0d5d9a1c0df605f093079d89f1a1c45893a05da Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 10 Mar 2017 11:10:57 -0500 Subject: [PATCH 14/18] Added a small documentation --- .gitignore | 1 + docs/Makefile | 20 +++++ docs/conf.py | 186 ++++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 39 +++++++++ docs/installation.rst | 30 +++++++ docs/pyplink.rst | 11 +++ 6 files changed, 287 insertions(+) create mode 100644 docs/Makefile create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/installation.rst create mode 100644 docs/pyplink.rst diff --git a/.gitignore b/.gitignore index bb6d856..833249c 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ htmlcov .ipynb_checkpoints build +_build diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..bcc041b --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = pyplink +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..b2a1082 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# pyplink documentation build configuration file, created by +# sphinx-quickstart on Fri Mar 10 09:00:31 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. + +import os +import sys +sys.path.insert(0, os.path.abspath('..')) + + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.intersphinx', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'sphinx.ext.githubpages', + 'sphinx.ext.napoleon', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'pyplink' +copyright = '2017, Louis-Philippe Lemieux Perreault' +author = 'Louis-Philippe Lemieux Perreault' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +import pyplink +version = ".".join(pyplink.__version__.split(".")[:-1]) +release = pyplink.__version__ + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. + +html_theme_options = { + 'github_user': 'lemieuxl', + 'github_repo': 'pyplink', + 'github_button': False, + 'fixed_sidebar': True, +} + +html_sidebars = { + '**': [ + 'about.html', + 'navigation.html', + 'relations.html', + 'searchbox.html', + 'donate.html', + ] +} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pyplinkdoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'pyplink.tex', 'pyplink Documentation', + 'Louis-Philippe Lemieux Perreault', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'pyplink', 'pyplink Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'pyplink', 'pyplink Documentation', + author, 'pyplink', 'One line description of project.', + 'Miscellaneous'), +] + + + + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = { + 'https://docs.python.org/3': None, + 'numpy': ('http://docs.scipy.org/doc/numpy/', None), + 'pandas': ('http://pandas.pydata.org/pandas-docs/stable/', None), +} diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..b9aca61 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,39 @@ +.. pyplink documentation master file, created by + sphinx-quickstart on Fri Mar 10 09:00:31 2017. + +pyplink +======== + +:py:mod:`pyplink` is a Python (2 and 3) binary Plink file parser and writer +released under the MIT licence. The difference with existing parsers (and Plink +itself) is that :py:mod:`pyplink` does not load the BED file in memory, making +possible to work with extremely large files (*e.g.* 1000 Genomes Phase 3 +files). + +For more information on how to use :py:mod:`pyplink`, refer to the +:doc:`API documentation `. Below is a snippet describing the most +common usage of the module. + + +.. code-block:: python + + from pyplink import PyPlink + + with PyPlink("plink_file_prefix") as bed: + # Getting the BIM and FAM + bim = bed.get_bim() + fam = bed.get_fam() + + # Iterating over all loci + for loci_name, genotypes in bed: + pass + + # Getting the genotypes of a single marker (numpy.ndarray) + genotypes = bed.get_geno_marker("rs12345") + + +.. toctree:: + :hidden: + + installation + pyplink diff --git a/docs/installation.rst b/docs/installation.rst new file mode 100644 index 0000000..c7951dc --- /dev/null +++ b/docs/installation.rst @@ -0,0 +1,30 @@ + +Installation +============= + +You can install :py:mod:`pyplink` using either ``pip`` or ``conda``. + +.. code-block:: bash + + # Using pip + pip install pyplink + + +.. code-block:: bash + + # Using conda + conda config --add channels http://statgen.org/wp-content/uploads/Softwares/pyplink + conda install pyplink + + +To test the installation, run ``python`` and execute the following commands. + +.. code-block:: python + + >>> import pyplink + >>> pyplink.test() + ............................................. + ---------------------------------------------------------------------- + Ran 45 tests in 0.343s + + OK diff --git a/docs/pyplink.rst b/docs/pyplink.rst new file mode 100644 index 0000000..f9315a3 --- /dev/null +++ b/docs/pyplink.rst @@ -0,0 +1,11 @@ + +pyplink's API +============== + +.. autoclass:: pyplink.PyPlink + :members: + + +.. automodule:: pyplink + :members: + From 7a5612de0b2b67a42e005966f775dfa532fac5ca Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 10 Mar 2017 11:20:57 -0500 Subject: [PATCH 15/18] Added a small snippet in the docstring --- pyplink/pyplink.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index 6c23c9b..1839d46 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -72,6 +72,18 @@ class PyPlink(object): Reads or write binary Plink files (BED, BIM and FAM). + .. code-block:: python + + from pyplink import PyPlink + + # Reading BED files + with PyPlink("plink_file_prefix") as bed: + pass + + # Writing BED files + with PyPlink("plink_file_prefix", "w") as bed: + pass + """ # The genotypes values From 99d8c6079019035ec3028219cd4906196e40e458 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 10 Mar 2017 11:25:38 -0500 Subject: [PATCH 16/18] Changed return_seek for return_index (made more sense) --- pyplink/pyplink.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index 1839d46..932d6c5 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -387,8 +387,8 @@ def _read_bed(self): "{}".format(self.bed_filename)) # Checking the last entry (for BED corruption) - seek_position = self._get_seek_position(self._bim.iloc[-1, :].i) - bed_file.seek(seek_position) + seek_index = self._get_seek_position(self._bim.iloc[-1, :].i) + bed_file.seek(seek_index) geno = self._geno_values[ np.fromstring(bed_file.read(self._nb_bytes), dtype=np.uint8) ].flatten(order="C")[:self._nb_samples] @@ -434,12 +434,12 @@ def iter_acgt_geno(self): for i, (marker, geno) in enumerate(self.iter_geno()): yield marker, self._allele_encoding[i][geno] - def iter_geno_marker(self, markers, return_seek=False): + def iter_geno_marker(self, markers, return_index=False): """Iterates over genotypes for a list of markers. Args: markers (list): The list of markers to iterate onto. - return_seek (bool): Wether to return the marker's index or not. + return_index (bool): Wether to return the marker's index or not. Returns: tuple: The name of the marker as a string, and its genotypes as a @@ -454,9 +454,9 @@ def iter_geno_marker(self, markers, return_seek=False): markers = [markers] # Iterating over all markers - if return_seek: + if return_index: for marker in markers: - geno, seek = self.get_geno_marker(marker, return_seek=True) + geno, seek = self.get_geno_marker(marker, return_index=True) yield marker, geno, seek else: for marker in markers: @@ -474,16 +474,16 @@ def iter_acgt_geno_marker(self, markers): """ # We iterate over the markers - for snp, geno, s in self.iter_geno_marker(markers, return_seek=True): + for snp, geno, s in self.iter_geno_marker(markers, return_index=True): # Getting the SNP position and converting to ACGT yield snp, self._allele_encoding[s][geno] - def get_geno_marker(self, marker, return_seek=False): + def get_geno_marker(self, marker, return_index=False): """Gets the genotypes for a given marker. Args: marker (str): The name of the marker. - return_seek (bool): Wether to return the marker's index or not. + return_index (bool): Wether to return the marker's index or not. Returns: numpy.ndarray: The genotypes of the marker (additive format). @@ -497,11 +497,11 @@ def get_geno_marker(self, marker, return_seek=False): raise ValueError("{}: marker not in BIM".format(marker)) # Seeking to the correct position - seek_position = self._bim.loc[marker, "i"] - self.seek(seek_position) + seek_index = self._bim.loc[marker, "i"] + self.seek(seek_index) - if return_seek: - return self._read_current_marker(), seek_position + if return_index: + return self._read_current_marker(), seek_index return self._read_current_marker() def get_acgt_geno_marker(self, marker): @@ -515,7 +515,7 @@ def get_acgt_geno_marker(self, marker): """ # Getting the marker's genotypes - geno, snp_position = self.get_geno_marker(marker, return_seek=True) + geno, snp_position = self.get_geno_marker(marker, return_index=True) # Returning the ACGT's format of the genotypes return self._allele_encoding[snp_position][geno] From a1361062b6fc309371ca1535be851c9e560e06b1 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 10 Mar 2017 12:49:31 -0500 Subject: [PATCH 17/18] Added the link to the short documentation --- README.mkd | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.mkd b/README.mkd index 4e194f4..08c125d 100644 --- a/README.mkd +++ b/README.mkd @@ -5,7 +5,9 @@ # pyplink - Module to process Plink's binary files -`PyPlink` is a Python module to read and write Plink's binary files. +`PyPlink` is a Python module to read and write Plink's binary files. Short +documentation available at +[https://lemieuxl.github.io/pyplink/](https://lemieuxl.github.io/pyplink/). ## Dependencies From d8086c41111aae066f1961d52b2fdeef581af9e1 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 10 Mar 2017 12:50:40 -0500 Subject: [PATCH 18/18] For next release --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4589db7..1592560 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ MAJOR = 1 MINOR = 3 -MICRO = 1 +MICRO = 2 VERSION = "{}.{}.{}".format(MAJOR, MINOR, MICRO)