From 999503071ff31ff1c32dd91ab7e74633dcd27f4c Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Wed, 8 Mar 2017 14:10:22 -0500
Subject: [PATCH 01/18] Removed the deprecation warning

---
 pyplink/pyplink.py            | 12 +-----------
 pyplink/tests/test_pyplink.py | 13 -------------
 2 files changed, 1 insertion(+), 24 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index f09e5ad..0a90766 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -26,7 +26,6 @@
 
 
 import os
-import warnings
 from io import UnsupportedOperation
 
 try:
@@ -48,10 +47,6 @@
 __all__ = ["PyPlink"]
 
 
-# Allowing for warnings
-warnings.simplefilter("once", DeprecationWarning)
-
-
 # The recoding values
 _geno_recode = {1: -1,  # Unknown genotype
                 2:  1,  # Heterozygous genotype
@@ -217,7 +212,7 @@ def _read_bim(self):
         bim[0] = bim.a2 * 2           # Original '3'
         bim[-1] = "00"                # Original 1
 
-        # Testing something
+        # Decoding the allele
         allele_encoding = np.array(
             [bim[0], bim[1], bim[2], bim[-1]],
             dtype="U2",
@@ -387,11 +382,6 @@ def get_acgt_geno_marker(self, marker):
         # Returning the ACGT's format of the genotypes
         return self._allele_encoding[snp_position][geno]
 
-    def write_marker(self, genotypes):
-        """Deprecated function."""
-        warnings.warn("deprecated: use 'write_genotypes'", DeprecationWarning)
-        self.write_genotypes(genotypes)
-
     def write_genotypes(self, genotypes):
         """Write genotypes to binary file."""
         if self._mode != "w":
diff --git a/pyplink/tests/test_pyplink.py b/pyplink/tests/test_pyplink.py
index 254de22..4e29f16 100644
--- a/pyplink/tests/test_pyplink.py
+++ b/pyplink/tests/test_pyplink.py
@@ -33,7 +33,6 @@
 import zipfile
 import platform
 import unittest
-import warnings
 from tempfile import mkdtemp
 from io import UnsupportedOperation
 from distutils.spawn import find_executable
@@ -1052,15 +1051,3 @@ def test_write_genotypes_in_r_mode(self):
         with self.assertRaises(UnsupportedOperation) as cm:
             self.pedfile.write_genotypes([0, 0, 0])
         self.assertEqual("not available in 'r' mode", str(cm.exception))
-
-    def test_write_marker_deprecation_warning(self):
-        """Tests that a deprecation warning is triggered."""
-        with warnings.catch_warnings(record=True) as w:
-            with PyPlink(os.path.join(self.tmp_dir, "test_warns"), "w") as p:
-                p.write_marker([0, 0, 0])
-            self.assertEqual(1, len(w))
-            self.assertTrue(issubclass(w[0].category, DeprecationWarning))
-            self.assertEqual(
-                "deprecated: use 'write_genotypes'",
-                str(w[0].message),
-            )

From 69a614d15d9bfa7ed6e3e69b2f71b63b9510fed4 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Thu, 9 Mar 2017 12:56:28 -0500
Subject: [PATCH 02/18] Better way to make sure that snp name, alleles and
 sample IDs are always string

---
 pyplink/pyplink.py | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index 0a90766..4d3154e 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -26,6 +26,7 @@
 
 
 import os
+import logging
 from io import UnsupportedOperation
 
 try:
@@ -47,6 +48,10 @@
 __all__ = ["PyPlink"]
 
 
+# The logger
+logger = logging.getLogger(__name__)
+
+
 # The recoding values
 _geno_recode = {1: -1,  # Unknown genotype
                 2:  1,  # Heterozygous genotype
@@ -198,12 +203,8 @@ def _read_bim(self):
         """Reads the BIM file."""
         # Reading the BIM file and setting the values
         bim = pd.read_csv(self.bim_filename, delim_whitespace=True,
-                          names=["chrom", "snp", "cm", "pos", "a1", "a2"])
-
-        # The 'snp', 'a1' and 'a2' columns should always be strings
-        bim["snp"] = bim["snp"].astype(str)
-        bim["a1"] = bim["a1"].astype(str)
-        bim["a2"] = bim["a2"].astype(str)
+                          names=["chrom", "snp", "cm", "pos", "a1", "a2"],
+                          dtype=dict(snp=str, a1=str, a2=str))
 
         bim = bim.set_index("snp")
         bim["i"] = range(bim.shape[0])
@@ -243,14 +244,10 @@ def _read_fam(self):
         # Reading the FAM file and setting the values
         fam = pd.read_csv(self.fam_filename, delim_whitespace=True,
                           names=["fid", "iid", "father", "mother", "gender",
-                                 "status"])
-
-        # The sample IDs should always be strings (more logical that way)
-        fam["fid"] = fam["fid"].astype(str)
-        fam["iid"] = fam["iid"].astype(str)
-        fam["father"] = fam["father"].astype(str)
-        fam["mother"] = fam["mother"].astype(str)
+                                 "status"],
+                          dtype=dict(fid=str, iid=str, father=str, mother=str))
 
+        # Getting the byte and bit location of each samples
         fam["byte"] = [
             int(np.ceil((1 + 1) / 4.0)) - 1 for i in range(len(fam))
         ]

From 41db621799c637943c5ce942cdd96e1a8046fbe4 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Thu, 9 Mar 2017 13:17:19 -0500
Subject: [PATCH 03/18] Different way to create the encode key to increase
 speed

---
 pyplink/pyplink.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index 4d3154e..dc2b6d3 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -27,6 +27,7 @@
 
 import os
 import logging
+from itertools import repeat
 from io import UnsupportedOperation
 
 try:
@@ -206,16 +207,20 @@ def _read_bim(self):
                           names=["chrom", "snp", "cm", "pos", "a1", "a2"],
                           dtype=dict(snp=str, a1=str, a2=str))
 
+        # Saving the index as integer
+        bim["i"] = bim.index
+
+        # Checking for duplicated markers
         bim = bim.set_index("snp")
-        bim["i"] = range(bim.shape[0])
-        bim[2] = bim.a1 * 2           # Original '0'
-        bim[1] = bim.a1 + bim.a2      # Original '2'
-        bim[0] = bim.a2 * 2           # Original '3'
-        bim[-1] = "00"                # Original 1
 
-        # Decoding the allele
+        # Encoding the allele
+        #   - The original 0 is the actual 2 (a1/a1)
+        #   - The original 2 is the actual 1 (a1/a2)
+        #   - The original 3 is the actual 0 (a2/a2)
+        #   - The original 1 is the actual -1 (no call)
         allele_encoding = np.array(
-            [bim[0], bim[1], bim[2], bim[-1]],
+            [bim.a2 * 2, bim.a1 + bim.a2, bim.a1 * 2,
+             list(repeat("00", bim.shape[0]))],
             dtype="U2",
         )
         self._allele_encoding = allele_encoding.T

From 74663308f1302d339d648fffe8301f2bd1fd9232 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Thu, 9 Mar 2017 13:19:57 -0500
Subject: [PATCH 04/18] No need for the list markers (already in the index)

---
 pyplink/pyplink.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index dc2b6d3..27088c8 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -175,7 +175,7 @@ def next(self):
         if self._n > self._nb_markers:
             raise StopIteration()
 
-        return self._markers[self._n - 1], self._read_current_marker()
+        return self._bim.index[self._n - 1], self._read_current_marker()
 
     def _read_current_marker(self):
         """Reads the current marker and returns its genotypes."""
@@ -228,7 +228,6 @@ def _read_bim(self):
         # Saving the data in the object
         self._bim = bim[["chrom", "pos", "cm", "a1", "a2", "i"]]
         self._nb_markers = self._bim.shape[0]
-        self._markers = self._bim.index.values
 
     def get_bim(self):
         """Returns the BIM file."""

From a01a44cb930db491d16a2dc938ad40f4c32db8a2 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Thu, 9 Mar 2017 15:01:23 -0500
Subject: [PATCH 05/18] Fixes #2: Can now work with duplicated markers

---
 pyplink/pyplink.py | 41 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index 27088c8..8bda865 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -28,6 +28,7 @@
 import os
 import logging
 from itertools import repeat
+from collections import Counter
 from io import UnsupportedOperation
 
 try:
@@ -211,7 +212,36 @@ def _read_bim(self):
         bim["i"] = bim.index
 
         # Checking for duplicated markers
-        bim = bim.set_index("snp")
+        try:
+            bim = bim.set_index("snp", verify_integrity=True)
+            self._has_duplicated = False
+
+        except ValueError as e:
+            # Setting this flag to true
+            self._has_duplicated = True
+
+            # Finding the duplicated markers
+            duplicated = bim.snp.duplicated(keep=False)
+            duplicated_markers = bim.loc[duplicated, "snp"]
+            duplicated_marker_counts = duplicated_markers.value_counts()
+            self._dup_markers = set(duplicated_marker_counts.index)
+
+            # Logging a warning
+            logger.warning("Duplicated markers found")
+            for marker, count in duplicated_marker_counts.iteritems():
+                logger.warning("  - {}: {:,d} times".format(marker, count))
+            logger.warning("Appending ':dupX' to the duplicated markers "
+                           "according to their location in the BIM file")
+
+            # Renaming the markers
+            counter = Counter()
+            for i, marker in duplicated_markers.iteritems():
+                counter[marker] += 1
+                new_name = "{}:dup{}".format(marker, counter[marker])
+                bim.loc[i, "snp"] = new_name
+
+            # Resetting the index
+            bim = bim.set_index("snp", verify_integrity=True)
 
         # Encoding the allele
         #   - The original 0 is the actual 2 (a1/a1)
@@ -243,6 +273,13 @@ def get_nb_markers(self):
 
         return self._nb_markers
 
+    def get_duplicated_markers(self):
+        """Returns the duplicated markers, if any."""
+        if self._has_duplicated:
+            return self._dup_markers
+        else:
+            return set()
+
     def _read_fam(self):
         """Reads the FAM file."""
         # Reading the FAM file and setting the values
@@ -297,7 +334,7 @@ def _read_bed(self):
                 raise ValueError("not in SNP-major format (please recode): "
                                  "{}".format(self.bed_filename))
 
-            # Checking the last entry
+            # Checking the last entry (for BED corruption)
             seek_position = self._get_seek_position(self._bim.iloc[-1, :].i)
             bed_file.seek(seek_position)
             geno = self._geno_values[

From 820e1bd1c7e13e84c0967b31b09141aca177437a Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Thu, 9 Mar 2017 16:10:38 -0500
Subject: [PATCH 06/18] Added a test for the duplicated markers

---
 pyplink/tests/data/test_data_with_dup.bim |  10 ++
 pyplink/tests/test_pyplink.py             | 152 +++++++++++++++++-----
 2 files changed, 129 insertions(+), 33 deletions(-)
 create mode 100644 pyplink/tests/data/test_data_with_dup.bim

diff --git a/pyplink/tests/data/test_data_with_dup.bim b/pyplink/tests/data/test_data_with_dup.bim
new file mode 100644
index 0000000..df83cbe
--- /dev/null
+++ b/pyplink/tests/data/test_data_with_dup.bim
@@ -0,0 +1,10 @@
+1	rs10399749	0	45162	G	C
+1	rs2949420	0	45257	C	T
+1	rs2949421	0	45413	0	0
+1	rs2691310	0	46844	A	T
+1	rs4030303	0	72434	0	G
+1	rs4030303	0	72515	0	C
+1	rs4030303	0	77689	G	A
+1	rs940550	0	78032	0	T
+1	rs940550	0	81468	G	C
+1	rs2949420	0	222077	A	G
diff --git a/pyplink/tests/test_pyplink.py b/pyplink/tests/test_pyplink.py
index 4e29f16..4af424a 100644
--- a/pyplink/tests/test_pyplink.py
+++ b/pyplink/tests/test_pyplink.py
@@ -48,6 +48,11 @@
 except ImportError:
     from urllib import urlretrieve
 
+try:
+    from unittest import mock
+except ImportError:
+    import mock
+
 from pkg_resources import resource_filename
 
 import numpy as np
@@ -55,7 +60,7 @@
 
 from six.moves import range
 
-from ..pyplink import PyPlink
+from .. import pyplink
 
 
 def get_plink(tmp_dir):
@@ -170,7 +175,7 @@ def setUpClass(cls):
 
     def setUp(self):
         # Reading the plink binary file
-        self.pedfile = PyPlink(self.prefix)
+        self.pedfile = pyplink.PyPlink(self.prefix)
 
     @classmethod
     def tearDownClass(cls):
@@ -243,7 +248,7 @@ def test_pyplink_bad_bed(self):
 
         # This should raise an exception
         with self.assertRaises(ValueError) as cm:
-            PyPlink(new_prefix)
+            pyplink.PyPlink(new_prefix)
         self.assertEqual("invalid number of entries: corrupted BED?",
                          str(cm.exception))
 
@@ -254,7 +259,7 @@ def test_pyplink_bad_bed(self):
 
         # This should raise an exception
         with self.assertRaises(ValueError) as cm:
-            PyPlink(new_prefix)
+            pyplink.PyPlink(new_prefix)
         self.assertEqual("not a valid BED file: {}".format(new_bed),
                          str(cm.exception))
 
@@ -265,7 +270,7 @@ def test_pyplink_bad_bed(self):
 
         # This should raise an exception
         with self.assertRaises(ValueError) as cm:
-            PyPlink(new_prefix)
+            pyplink.PyPlink(new_prefix)
         self.assertEqual("not a valid BED file: {}".format(new_bed),
                          str(cm.exception))
 
@@ -276,7 +281,7 @@ def test_pyplink_bad_bed(self):
 
         # This should raise an exception
         with self.assertRaises(ValueError) as cm:
-            PyPlink(new_prefix)
+            pyplink.PyPlink(new_prefix)
         self.assertEqual(
             "not in SNP-major format (please recode): {}".format(new_bed),
             str(cm.exception),
@@ -294,7 +299,7 @@ def test_missing_files(self):
         for extension in (".bed", ".bim", ".fam"):
             os.remove(prefix + extension)
             with self.assertRaises(IOError) as cm:
-                PyPlink(prefix)
+                pyplink.PyPlink(prefix)
             self.assertEqual("No such file: '{}'".format(prefix + extension),
                              str(cm.exception))
             with open(prefix + extension, "w"):
@@ -308,7 +313,8 @@ def test_get_nb_markers_w_mode(self):
         """Tests that an exception is raised if in write mode."""
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
-            with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
+            prefix = os.path.join(self.tmp_dir, "test_error")
+            with pyplink.PyPlink(prefix, "w") as p:
                 p.get_nb_markers()
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
@@ -320,10 +326,12 @@ def test_get_nb_samples_w_mode(self):
         """Tests that an exception is raised if in write mode."""
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
-            with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
+            prefix = os.path.join(self.tmp_dir, "test_error")
+            with pyplink.PyPlink(prefix, "w") as p:
                 p.get_nb_samples()
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
+    # TODO: Change some chromosomes and cm just to be sure
     def test_get_bim(self):
         """Tests the 'get_bim' function."""
         # The original BIM file (with the 'i' column)
@@ -374,7 +382,8 @@ def test_get_bim_w_mode(self):
         """Tests that an exception is raised if in write mode."""
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
-            with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
+            prefix = os.path.join(self.tmp_dir, "test_error")
+            with pyplink.PyPlink(prefix, "w") as p:
                 p.get_bim()
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
@@ -431,7 +440,8 @@ def test_get_fam_w_mode(self):
         """Tests that an exception is raised if in write mode."""
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
-            with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
+            prefix = os.path.join(self.tmp_dir, "test_error")
+            with pyplink.PyPlink(prefix, "w") as p:
                 p.get_fam()
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
@@ -454,7 +464,8 @@ def test_generator_w_mode(self):
         """Tests that an exception is raised if in write mode."""
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
-            with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
+            prefix = os.path.join(self.tmp_dir, "test_error")
+            with pyplink.PyPlink(prefix, "w") as p:
                 marker, genotypes = next(p)
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
@@ -470,7 +481,8 @@ def test_next_w_mode(self):
         """Tests that an exception is raised when calling next in w mode."""
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
-            with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
+            prefix = os.path.join(self.tmp_dir, "test_error")
+            with pyplink.PyPlink(prefix, "w") as p:
                 p.next()
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
@@ -532,7 +544,8 @@ def test_seek_w_mode(self):
         """Tests that an exception is raised if in write mode."""
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
-            with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
+            prefix = os.path.join(self.tmp_dir, "test_error")
+            with pyplink.PyPlink(prefix, "w") as p:
                 p.seek(100)
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
@@ -550,7 +563,8 @@ def test_iter_geno_w_mode(self):
         """Tests that an exception is raised if in write mode."""
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
-            with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
+            prefix = os.path.join(self.tmp_dir, "test_error")
+            with pyplink.PyPlink(prefix, "w") as p:
                 marker, genotypes = next(p.iter_geno())
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
@@ -568,7 +582,8 @@ def test_iter_acgt_geno_w_mode(self):
         """Tests that an exception is raised if in write mode."""
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
-            with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
+            prefix = os.path.join(self.tmp_dir, "test_error")
+            with pyplink.PyPlink(prefix, "w") as p:
                 marker, genotypes = next(p.iter_acgt_geno())
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
@@ -608,7 +623,8 @@ def test_iter_geno_marker_w_mode(self):
         """Tests that an exception is raised if in write mode."""
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
-            with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
+            prefix = os.path.join(self.tmp_dir, "test_error")
+            with pyplink.PyPlink(prefix, "w") as p:
                 marker, genotypes = next(p.iter_geno_marker(["M1", "M2"]))
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
@@ -648,7 +664,8 @@ def test_iter_acgt_geno_marker_w_mode(self):
         """Tests that an exception is raised if in write mode."""
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
-            with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
+            prefix = os.path.join(self.tmp_dir, "test_error")
+            with pyplink.PyPlink(prefix, "w") as p:
                 marker, genotypes = next(p.iter_acgt_geno_marker(["M1", "M2"]))
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
@@ -680,7 +697,8 @@ def test_repr_w_mode(self):
         e_repr = 'PyPlink(mode="w")'
 
         # Creating the dummy PyPlink object
-        with PyPlink(os.path.join(self.tmp_dir, "test_repr"), "w") as pedfile:
+        prefix = os.path.join(self.tmp_dir, "test_repr")
+        with pyplink.PyPlink(prefix, "w") as pedfile:
             # Comparing the expected with the observed representation
             o_repr = str(pedfile)
             self.assertEqual(e_repr, o_repr)
@@ -708,7 +726,8 @@ def test_get_geno_marker_w_mode(self):
         """Tests that an exception is raised if in write mode."""
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
-            with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
+            prefix = os.path.join(self.tmp_dir, "test_error")
+            with pyplink.PyPlink(prefix, "w") as p:
                 p.get_geno_marker("M1")
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
@@ -716,7 +735,8 @@ def test_get_iter_w_mode(self):
         """Tests that an exception is raised if in write mode."""
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
-            with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
+            prefix = os.path.join(self.tmp_dir, "test_error")
+            with pyplink.PyPlink(prefix, "w") as p:
                 iter(p)
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
@@ -740,19 +760,20 @@ def test_get_acgt_geno_marker_w_mode(self):
         """Tests that an exception is raised if in write mode."""
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
-            with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
+            prefix = os.path.join(self.tmp_dir, "test_error")
+            with pyplink.PyPlink(prefix, "w") as p:
                 p.get_acgt_geno_marker("M1")
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
     def test_get_context_read_mode(self):
         """Tests the PyPlink object as context manager."""
-        with PyPlink(self.prefix) as genotypes:
+        with pyplink.PyPlink(self.prefix) as genotypes:
             self.assertEqual(3, len(genotypes.get_fam().head(n=3)))
 
     def test_invalid_mode(self):
         """Tests invalid mode when PyPlink as context manager."""
         with self.assertRaises(ValueError) as cm:
-            PyPlink(self.prefix, "u")
+            pyplink.PyPlink(self.prefix, "u")
         self.assertEqual("invalid mode: 'u'", str(cm.exception))
 
     def test_write_binary(self):
@@ -768,7 +789,7 @@ def test_write_binary(self):
         test_prefix = os.path.join(self.tmp_dir, "test_write")
 
         # Writing the binary file
-        with PyPlink(test_prefix, "w") as pedfile:
+        with pyplink.PyPlink(test_prefix, "w") as pedfile:
             for genotypes in expected_genotypes:
                 pedfile.write_genotypes(genotypes)
 
@@ -788,7 +809,7 @@ def test_write_binary(self):
                       file=o_file)
 
         # Reading the written binary file
-        with PyPlink(test_prefix) as pedfile:
+        with pyplink.PyPlink(test_prefix) as pedfile:
             for i, (marker, genotypes) in enumerate(pedfile):
                 self.assertEqual("m{}".format(i+1), marker)
                 np.testing.assert_array_equal(expected_genotypes[i], genotypes)
@@ -807,7 +828,7 @@ def test_write_binary_error(self):
 
         # Writing the binary file
         with self.assertRaises(ValueError) as cm:
-            with PyPlink(test_prefix, "w") as pedfile:
+            with pyplink.PyPlink(test_prefix, "w") as pedfile:
                 pedfile.write_genotypes(expected_genotypes[0])  # 7 genotypes
                 pedfile.write_genotypes(expected_genotypes[1])  # 6 genotypes
         self.assertEqual("7 samples expected, got 6", str(cm.exception))
@@ -820,7 +841,7 @@ def test_grouper_padding(self):
             (6, 7, 8),
             (9, 0, 0),
         ]
-        observed_chunks = PyPlink._grouper(range(10), 3)
+        observed_chunks = pyplink.PyPlink._grouper(range(10), 3)
         for expected, observed in zip(expected_chunks, observed_chunks):
             self.assertEqual(expected, observed)
 
@@ -830,7 +851,7 @@ def test_grouper_no_padding(self):
             (0, 1, 2, 3, 4),
             (5, 6, 7, 8, 9),
         ]
-        observed_chunks = PyPlink._grouper(range(10), 5)
+        observed_chunks = pyplink.PyPlink._grouper(range(10), 5)
         for expected, observed in zip(expected_chunks, observed_chunks):
             self.assertEqual(expected, observed)
 
@@ -849,7 +870,7 @@ def test_with_plink(self):
             [0, 0, 0, 0,  0, 1, 0, 0, 0, 0],
         ]
         prefix = os.path.join(self.tmp_dir, "test_output")
-        with PyPlink(prefix, "w") as pedfile:
+        with pyplink.PyPlink(prefix, "w") as pedfile:
             for genotypes in all_genotypes:
                 pedfile.write_genotypes(genotypes)
 
@@ -949,7 +970,7 @@ def test_with_plink_individual_major(self):
 
         # Creating the BED file (INDIVIDUAL-major)
         prefix = os.path.join(self.tmp_dir, "test_output")
-        with PyPlink(prefix, "w", "INDIVIDUAL-major") as pedfile:
+        with pyplink.PyPlink(prefix, "w", "INDIVIDUAL-major") as pedfile:
             for genotypes in transposed_genotypes:
                 pedfile.write_genotypes(genotypes)
 
@@ -1031,7 +1052,7 @@ def test_with_plink_individual_major(self):
     def test_wrong_bed_format(self):
         """Tests opening a BED file with unknown format."""
         with self.assertRaises(ValueError) as cm:
-            PyPlink(self.prefix, bed_format="UNKNOWN-major")
+            pyplink.PyPlink(self.prefix, bed_format="UNKNOWN-major")
         self.assertEqual(
             "invalid bed format: UNKNOWN-major",
             str(cm.exception),
@@ -1040,7 +1061,7 @@ def test_wrong_bed_format(self):
     def test_invalid_bed_format_with_r_mode(self):
         """Tests an invalid BED format with r mode."""
         with self.assertRaises(ValueError) as cm:
-            PyPlink(self.prefix, bed_format="INDIVIDUAL-major")
+            pyplink.PyPlink(self.prefix, bed_format="INDIVIDUAL-major")
         self.assertEqual(
             "only SNP-major format is supported with mode 'r'",
             str(cm.exception),
@@ -1051,3 +1072,68 @@ def test_write_genotypes_in_r_mode(self):
         with self.assertRaises(UnsupportedOperation) as cm:
             self.pedfile.write_genotypes([0, 0, 0])
         self.assertEqual("not available in 'r' mode", str(cm.exception))
+
+    @mock.patch.object(pyplink, "logger")
+    def test_dup_markers(self, pyplink_logger):
+        """Tests when there are duplicated markers."""
+        # Checking the original one has no duplicates
+        self.assertEqual(len(self.pedfile.get_duplicated_markers()), 0)
+
+        # Copying the BED and the FAM to the temporary directory
+        new_prefix = os.path.join(self.tmp_dir, "with_dup")
+        for suffix in (".bed", ".fam"):
+            shutil.copyfile(self.prefix + suffix, new_prefix + suffix)
+
+        # Copying the BIM file to the temporary directory
+        shutil.copyfile(self.prefix + "_with_dup.bim", new_prefix + ".bim")
+
+        # Reading the new files
+        pedfile = pyplink.PyPlink(new_prefix)
+
+        # Checking a warning was called
+        self.assertTrue(pyplink_logger.warning.called)
+
+        # Checking the BIM
+        chromosomes = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+        markers = ["rs10399749", "rs2949420:dup1", "rs2949421", "rs2691310",
+                   "rs4030303:dup1", "rs4030303:dup2", "rs4030303:dup3",
+                   "rs940550:dup1", "rs940550:dup2", "rs2949420:dup2"]
+        positions = [45162, 45257, 45413, 46844, 72434, 72515, 77689, 78032,
+                     81468, 222077]
+        cms = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+        a1s = ["G", "C", "0", "A", "0", "0", "G", "0", "G", "A"]
+        a2s = ["C", "T", "0", "T", "G", "C", "A", "T", "C", "G"]
+
+        # Getting the BIM file
+        bim = pedfile.get_bim()
+
+        # Checking the columns
+        self.assertTrue(
+            set(bim.columns.values) == {"chrom", "pos", "cm", "a1", "a2"}
+        )
+
+        # Checking the indexes
+        self.assertTrue(set(bim.index.values) == set(markers))
+
+        # Checking the values for the markers
+        zipped = zip(markers, chromosomes, positions, cms, a1s, a2s)
+        for marker, chrom, pos, cm, a1, a2 in zipped:
+            self.assertEqual(chrom, bim.loc[marker, "chrom"])
+            self.assertEqual(pos, bim.loc[marker, "pos"])
+            self.assertEqual(cm, bim.loc[marker, "cm"])
+            self.assertEqual(a1, bim.loc[marker, "a1"])
+            self.assertEqual(a2, bim.loc[marker, "a2"])
+
+        # Checking only one duplicated markers
+        for i, marker in enumerate(markers):
+            geno = pedfile.get_geno_marker(marker)
+            np.testing.assert_array_equal(geno, self.genotypes[i])
+
+        # Checking the list of duplicated markers
+        self.assertEqual(
+            set(m.split(":")[0] for m in markers if ":" in m),
+            pedfile.get_duplicated_markers(),
+        )
+
+        # Closing the file
+        pedfile.close()

From 44d85e0828ad04404af466f58cc49cbba9b0ea26 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Thu, 9 Mar 2017 16:10:55 -0500
Subject: [PATCH 07/18] Added the dependencies for mock (pyhon 2)

---
 setup.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index e27588e..4589db7 100644
--- a/setup.py
+++ b/setup.py
@@ -11,6 +11,7 @@
 
 
 import os
+import sys
 from setuptools import setup
 
 
@@ -39,6 +40,17 @@ def write_version_file(fn=None):
         a.close()
 
 
+def get_requirements():
+    # Initial requirements
+    requirements = ["numpy >= 1.8.2", "pandas >= 0.14.1", "six >= 1.9.0"]
+
+    # Checking if python 2 (requires mock)
+    if sys.version_info[0] == 2:
+        requirements.append(["mock >= 2.0.0"])
+
+    return requirements
+
+
 def setup_package():
     # Saving the version into a file
     write_version_file()
@@ -55,8 +67,7 @@ def setup_package():
         packages=["pyplink", "pyplink.tests"],
         package_data={"pyplink.tests": ["data/test_data.*"], },
         test_suite="pyplink.tests.test_suite",
-        install_requires=["numpy >= 1.8.2", "pandas >= 0.14.1",
-                          "six >= 1.9.0"],
+        install_requires=get_requirements(),
         classifiers=["Operating System :: POSIX :: Linux",
                      "Operating System :: MacOS :: MacOS X",
                      "Operating System :: Microsoft",

From 036b23f4cf177f1cf5d2253f5a645848d9f0735d Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Thu, 9 Mar 2017 16:21:27 -0500
Subject: [PATCH 08/18] Changed the chromosome and cm of the test datasets

---
 pyplink/tests/data/test_data.bim          | 18 +++++++++---------
 pyplink/tests/data/test_data_with_dup.bim | 18 +++++++++---------
 pyplink/tests/test_pyplink.py             |  9 ++++-----
 3 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/pyplink/tests/data/test_data.bim b/pyplink/tests/data/test_data.bim
index 2bb9231..10df21c 100644
--- a/pyplink/tests/data/test_data.bim
+++ b/pyplink/tests/data/test_data.bim
@@ -1,10 +1,10 @@
 1	rs10399749	0	45162	G	C
-1	rs2949420	0	45257	C	T
-1	rs2949421	0	45413	0	0
-1	rs2691310	0	46844	A	T
-1	rs4030303	0	72434	0	G
-1	rs4030300	0	72515	0	C
-1	rs3855952	0	77689	G	A
-1	rs940550	0	78032	0	T
-1	rs13328714	0	81468	G	C
-1	rs11490937	0	222077	A	G
+2	rs2949420	1	45257	C	T
+3	rs2949421	1	45413	0	0
+4	rs2691310	2	46844	A	T
+4	rs4030303	2	72434	0	G
+5	rs4030300	3	72515	0	C
+6	rs3855952	4	77689	G	A
+6	rs940550	4	78032	0	T
+6	rs13328714	5	81468	G	C
+8	rs11490937	6	222077	A	G
diff --git a/pyplink/tests/data/test_data_with_dup.bim b/pyplink/tests/data/test_data_with_dup.bim
index df83cbe..a67858a 100644
--- a/pyplink/tests/data/test_data_with_dup.bim
+++ b/pyplink/tests/data/test_data_with_dup.bim
@@ -1,10 +1,10 @@
 1	rs10399749	0	45162	G	C
-1	rs2949420	0	45257	C	T
-1	rs2949421	0	45413	0	0
-1	rs2691310	0	46844	A	T
-1	rs4030303	0	72434	0	G
-1	rs4030303	0	72515	0	C
-1	rs4030303	0	77689	G	A
-1	rs940550	0	78032	0	T
-1	rs940550	0	81468	G	C
-1	rs2949420	0	222077	A	G
+2	rs2949420	1	45257	C	T
+3	rs2949421	1	45413	0	0
+4	rs2691310	2	46844	A	T
+4	rs4030303	2	72434	0	G
+5	rs4030303	3	72515	0	C
+6	rs4030303	4	77689	G	A
+6	rs940550	4	78032	0	T
+6	rs940550	5	81468	G	C
+8	rs2949420	6	222077	A	G
diff --git a/pyplink/tests/test_pyplink.py b/pyplink/tests/test_pyplink.py
index 4af424a..afee1d0 100644
--- a/pyplink/tests/test_pyplink.py
+++ b/pyplink/tests/test_pyplink.py
@@ -331,17 +331,16 @@ def test_get_nb_samples_w_mode(self):
                 p.get_nb_samples()
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
-    # TODO: Change some chromosomes and cm just to be sure
     def test_get_bim(self):
         """Tests the 'get_bim' function."""
         # The original BIM file (with the 'i' column)
         ori_bim = self.pedfile._bim
 
         # The expected values
-        chromosomes = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+        chromosomes = [1, 2, 3, 4, 4, 5, 6, 6, 6, 8]
         positions = [45162, 45257, 45413, 46844, 72434, 72515, 77689, 78032,
                      81468, 222077]
-        cms = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+        cms = [0, 1, 1, 2, 2, 3, 4, 4, 5, 6]
         a1s = ["G", "C", "0", "A", "0", "0", "G", "0", "G", "A"]
         a2s = ["C", "T", "0", "T", "G", "C", "A", "T", "C", "G"]
 
@@ -1094,13 +1093,13 @@ def test_dup_markers(self, pyplink_logger):
         self.assertTrue(pyplink_logger.warning.called)
 
         # Checking the BIM
-        chromosomes = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+        chromosomes = [1, 2, 3, 4, 4, 5, 6, 6, 6, 8]
         markers = ["rs10399749", "rs2949420:dup1", "rs2949421", "rs2691310",
                    "rs4030303:dup1", "rs4030303:dup2", "rs4030303:dup3",
                    "rs940550:dup1", "rs940550:dup2", "rs2949420:dup2"]
         positions = [45162, 45257, 45413, 46844, 72434, 72515, 77689, 78032,
                      81468, 222077]
-        cms = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+        cms = [0, 1, 1, 2, 2, 3, 4, 4, 5, 6]
         a1s = ["G", "C", "0", "A", "0", "0", "G", "0", "G", "A"]
         a2s = ["C", "T", "0", "T", "G", "C", "A", "T", "C", "G"]
 

From 43d24d62ad0e2c902aaefb8d118870f725b8a4ae Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Thu, 9 Mar 2017 16:26:22 -0500
Subject: [PATCH 09/18] Added mock at installation

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index 33cf18c..d90e449 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,6 +9,7 @@ install:
   - pip install six
   - pip install numpy
   - pip install pandas
+  - pip install mock
   - pip install coveralls
 script:
   - coverage run setup.py test

From 67f63f34d5a5ef1e83d30294a7801b1dd57d0c10 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Thu, 9 Mar 2017 16:32:37 -0500
Subject: [PATCH 10/18] Travis...

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index d90e449..7668ff0 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,7 +9,7 @@ install:
   - pip install six
   - pip install numpy
   - pip install pandas
-  - pip install mock
+  - pip install -U mock
   - pip install coveralls
 script:
   - coverage run setup.py test

From d1c177778471be79b2cc4632f97a9706362ef5e6 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 10 Mar 2017 11:09:19 -0500
Subject: [PATCH 11/18] Updated README

---
 README.mkd | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/README.mkd b/README.mkd
index d9d0e65..4e194f4 100644
--- a/README.mkd
+++ b/README.mkd
@@ -67,12 +67,11 @@ conda update pyplink -c http://statgen.org/wp-content/uploads/Softwares/pyplink
 
 To test the module, just perform the following command:
 
-```python
->>> import pyplink
->>> pyplink.test()
+```console
+$ python -m pyplink.tests
 .............................................
 ----------------------------------------------------------------------
-Ran 45 tests in 0.480s
+Ran 45 tests in 0.334s
 
 OK
 ```

From 3cc9f6b0fa077ece50e189a94cc0ed120e0bc81f Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 10 Mar 2017 11:09:39 -0500
Subject: [PATCH 12/18] Better docstrings

---
 pyplink/__init__.py |   4 +-
 pyplink/pyplink.py  | 168 +++++++++++++++++++++++++++++++++++---------
 2 files changed, 137 insertions(+), 35 deletions(-)

diff --git a/pyplink/__init__.py b/pyplink/__init__.py
index 39c8ed0..dc8f1b0 100644
--- a/pyplink/__init__.py
+++ b/pyplink/__init__.py
@@ -25,7 +25,7 @@
 # THE SOFTWARE.
 
 
-from .pyplink import *
+from .pyplink import PyPlink
 
 try:
     from .version import pyplink_version as __version__
@@ -46,7 +46,7 @@ def test(verbosity=1):  # pragma: no cover
     """Executes all the tests for pyplink.
 
     Args:
-        verbosity (int): the verbosity level for unittest
+        verbosity (int): The verbosity level for :py:mod:`unittest`.
 
     Just set ``verbosity`` to an integer higher than ``1`` to have more
     information about the tests.
diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index 8bda865..6c23c9b 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -63,7 +63,16 @@
 
 
 class PyPlink(object):
-    """Reads and store a set of binary Plink files."""
+    """Reads and store a set of binary Plink files.
+
+    Args:
+        prefix (str): The prefix of the binary Plink files.
+        mode (str): The open mode for the binary Plink file.
+        bed_format (str): The type of bed (SNP-major or INDIVIDUAL-major).
+
+    Reads or write binary Plink files (BED, BIM and FAM).
+
+    """
 
     # The genotypes values
     _geno_values = np.array(
@@ -74,18 +83,8 @@ class PyPlink(object):
         dtype=np.int8,
     )
 
-    def __init__(self, i_prefix, mode="r", bed_format="SNP-major"):
-        """Initializes a new PyPlink object.
-
-        Args:
-            i_prefix (str): the prefix of the binary Plink files
-            mode (str): the open mode for the binary Plink file
-            nb_samples (int): the number of samples
-            bed_format (str): the type of bed (SNP-major or INDIVIDUAL-major)
-
-        Reads or write binary Plink files (BED, BIM and FAM).
-
-        """
+    def __init__(self, prefix, mode="r", bed_format="SNP-major"):
+        """Initializes a new PyPlink instance."""
         # The mode
         self._mode = mode
 
@@ -95,9 +94,9 @@ def __init__(self, i_prefix, mode="r", bed_format="SNP-major"):
         self._bed_format = bed_format
 
         # These are the name of the files
-        self.bed_filename = "{}.bed".format(i_prefix)
-        self.bim_filename = "{}.bim".format(i_prefix)
-        self.fam_filename = "{}.fam".format(i_prefix)
+        self.bed_filename = "{}.bed".format(prefix)
+        self.bim_filename = "{}.bim".format(prefix)
+        self.fam_filename = "{}.fam".format(prefix)
 
         if self._mode == "r":
             if self._bed_format != "SNP-major":
@@ -163,12 +162,18 @@ def __exit__(self, *args):
         self.close()
 
     def close(self):
-        """Closes the BED file if required."""
+        """Closes the BED file."""
         # Closing the BED file
         self._bed.close()
 
     def next(self):
-        """The next function."""
+        """Returns the next marker.
+
+        Returns:
+            tuple: The marker name as a string and its genotypes as a
+            :py:class:`numpy.ndarray`.
+
+        """
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
 
@@ -185,7 +190,12 @@ def _read_current_marker(self):
         ].flatten(order="C")[:self._nb_samples]
 
     def seek(self, n):
-        """Gets to a certain position in the BED file when iterating."""
+        """Gets to a certain marker position in the BED file.
+
+        Args:
+            n (int): The index of the marker to seek to.
+
+        """
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
 
@@ -198,7 +208,12 @@ def seek(self, n):
             raise ValueError("invalid position in BED: {}".format(n))
 
     def _get_seek_position(self, n):
-        """Gets the seek position in the file (including special bytes)."""
+        """Gets the seek position in the file (including special bytes).
+
+        Args:
+            n (int): The index of the marker to seek to.
+
+        """
         return 3 + self._nb_bytes * n
 
     def _read_bim(self):
@@ -260,21 +275,36 @@ def _read_bim(self):
         self._nb_markers = self._bim.shape[0]
 
     def get_bim(self):
-        """Returns the BIM file."""
+        """Returns the BIM file.
+
+        Returns:
+            pandas.DataFrame: The BIM file.
+
+        """
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
 
         return self._bim.drop("i", axis=1)
 
     def get_nb_markers(self):
-        """Returns the number of markers."""
+        """Returns the number of markers.
+
+        Returns:
+            int: The number of markers in the dataset.
+
+        """
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
 
         return self._nb_markers
 
     def get_duplicated_markers(self):
-        """Returns the duplicated markers, if any."""
+        """Returns the duplicated markers, if any.
+
+        Args:
+            set: The set of duplicated marker (might be empty).
+
+        """
         if self._has_duplicated:
             return self._dup_markers
         else:
@@ -299,14 +329,24 @@ def _read_fam(self):
         self._nb_samples = self._fam.shape[0]
 
     def get_fam(self):
-        """Returns the FAM file."""
+        """Returns the FAM file.
+
+        Returns:
+            pandas.DataFrame: The FAM file.
+
+        """
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
 
         return self._fam.drop(["byte", "bit"], axis=1)
 
     def get_nb_samples(self):
-        """Returns the number of samples."""
+        """Returns the number of samples.
+
+        Returns:
+            int: The number of samples in the dataset.
+
+        """
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
 
@@ -354,7 +394,13 @@ def _write_bed_header(self):
         self._bed.write(bytearray((108, 27, final_byte)))
 
     def iter_geno(self):
-        """Iterates over genotypes from the beginning."""
+        """Iterates over genotypes from the beginning of the BED file.
+
+        Returns:
+            tuple: The name of the marker as a string, and its genotypes as a
+            :py:class:`numpy.ndarray` (additive format).
+
+        """
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
 
@@ -365,13 +411,29 @@ def iter_geno(self):
         return self
 
     def iter_acgt_geno(self):
-        """Iterates over genotypes (ACGT format)."""
+        """Iterates over genotypes (ACGT format).
+
+        Returns:
+            tuple: The name of the marker as a string, and its genotypes as a
+            :py:class:`numpy.ndarray` (ACGT format).
+
+        """
         # Need to iterate over itself, and modify the actual genotypes
         for i, (marker, geno) in enumerate(self.iter_geno()):
             yield marker, self._allele_encoding[i][geno]
 
     def iter_geno_marker(self, markers, return_seek=False):
-        """Iterates over genotypes for given markers."""
+        """Iterates over genotypes for a list of markers.
+
+        Args:
+            markers (list): The list of markers to iterate onto.
+            return_seek (bool): Wether to return the marker's index or not.
+
+        Returns:
+            tuple: The name of the marker as a string, and its genotypes as a
+            :py:class:`numpy.ndarray` (additive format).
+
+        """
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
 
@@ -389,14 +451,32 @@ def iter_geno_marker(self, markers, return_seek=False):
                 yield marker, self.get_geno_marker(marker)
 
     def iter_acgt_geno_marker(self, markers):
-        """Iterates over genotypes for given markers (ACGT format)."""
+        """Iterates over genotypes for a list of markers (ACGT format).
+
+        Args:
+            markers (list): The list of markers to iterate onto.
+
+        Returns:
+            tuple: The name of the marker as a string, and its genotypes as a
+            :py:class:`numpy.ndarray` (ACGT format).
+
+        """
         # We iterate over the markers
         for snp, geno, s in self.iter_geno_marker(markers, return_seek=True):
             # Getting the SNP position and converting to ACGT
             yield snp, self._allele_encoding[s][geno]
 
     def get_geno_marker(self, marker, return_seek=False):
-        """Gets the genotypes for a given marker."""
+        """Gets the genotypes for a given marker.
+
+        Args:
+            marker (str): The name of the marker.
+            return_seek (bool): Wether to return the marker's index or not.
+
+        Returns:
+            numpy.ndarray: The genotypes of the marker (additive format).
+
+        """
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
 
@@ -413,7 +493,15 @@ def get_geno_marker(self, marker, return_seek=False):
         return self._read_current_marker()
 
     def get_acgt_geno_marker(self, marker):
-        """Gets the genotypes for a given marker (ACGT format)."""
+        """Gets the genotypes for a given marker (ACGT format).
+
+        Args:
+            marker (str): The name of the marker.
+
+        Returns:
+            numpy.ndarray: The genotypes of the marker (ACGT format).
+
+        """
         # Getting the marker's genotypes
         geno, snp_position = self.get_geno_marker(marker, return_seek=True)
 
@@ -421,7 +509,12 @@ def get_acgt_geno_marker(self, marker):
         return self._allele_encoding[snp_position][geno]
 
     def write_genotypes(self, genotypes):
-        """Write genotypes to binary file."""
+        """Write genotypes to binary file.
+
+        Args:
+            genotypes (numpy.ndarray): The genotypes to write in the BED file.
+
+        """
         if self._mode != "w":
             raise UnsupportedOperation("not available in 'r' mode")
 
@@ -445,6 +538,15 @@ def write_genotypes(self, genotypes):
 
     @staticmethod
     def _grouper(iterable, n, fillvalue=0):
-        """Collect data into fixed-length chunks or blocks."""
+        """Collect data into fixed-length chunks or blocks.
+
+        Args:
+            n (int): The size of the chunk.
+            fillvalue (int): The fill value.
+
+        Returns:
+            iterator: An iterator over the chunks.
+
+        """
         args = [iter(iterable)] * n
         return zip_longest(fillvalue=fillvalue, *args)

From b0e81d445f25a90194a9d601ccbed4dc8da664ab Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 10 Mar 2017 11:10:04 -0500
Subject: [PATCH 13/18] Now possible to execute tests using 'python -m
 pyplink.tests'.

---
 pyplink/tests/__main__.py | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 pyplink/tests/__main__.py

diff --git a/pyplink/tests/__main__.py b/pyplink/tests/__main__.py
new file mode 100644
index 0000000..96cdc50
--- /dev/null
+++ b/pyplink/tests/__main__.py
@@ -0,0 +1,36 @@
+# This file is part of pyplink.
+#
+# The MIT License (MIT)
+#
+# Copyright (c) 2014 Louis-Philippe Lemieux Perreault
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+
+import unittest
+
+from . import test_suite
+
+
+__author__ = "Louis-Philippe Lemieux Perreault"
+__copyright__ = "Copyright 2014 Louis-Philippe Lemieux Perreault"
+__license__ = "MIT"
+
+
+unittest.TextTestRunner(verbosity=1).run(test_suite)

From c0d5d9a1c0df605f093079d89f1a1c45893a05da Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 10 Mar 2017 11:10:57 -0500
Subject: [PATCH 14/18] Added a small documentation

---
 .gitignore            |   1 +
 docs/Makefile         |  20 +++++
 docs/conf.py          | 186 ++++++++++++++++++++++++++++++++++++++++++
 docs/index.rst        |  39 +++++++++
 docs/installation.rst |  30 +++++++
 docs/pyplink.rst      |  11 +++
 6 files changed, 287 insertions(+)
 create mode 100644 docs/Makefile
 create mode 100644 docs/conf.py
 create mode 100644 docs/index.rst
 create mode 100644 docs/installation.rst
 create mode 100644 docs/pyplink.rst

diff --git a/.gitignore b/.gitignore
index bb6d856..833249c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,4 @@ htmlcov
 .ipynb_checkpoints
 
 build
+_build
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..bcc041b
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SPHINXPROJ    = pyplink
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..b2a1082
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# pyplink documentation build configuration file, created by
+# sphinx-quickstart on Fri Mar 10 09:00:31 2017.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+
+import os
+import sys
+sys.path.insert(0, os.path.abspath('..'))
+
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.intersphinx',
+    'sphinx.ext.mathjax',
+    'sphinx.ext.viewcode',
+    'sphinx.ext.githubpages',
+    'sphinx.ext.napoleon',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'pyplink'
+copyright = '2017, Louis-Philippe Lemieux Perreault'
+author = 'Louis-Philippe Lemieux Perreault'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+import pyplink
+version = ".".join(pyplink.__version__.split(".")[:-1])
+release = pyplink.__version__
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This patterns also effect to html_static_path and html_extra_path
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'alabaster'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+
+html_theme_options = {
+    'github_user': 'lemieuxl',
+    'github_repo': 'pyplink',
+    'github_button': False,
+    'fixed_sidebar': True,
+}
+
+html_sidebars = {
+    '**': [
+        'about.html',
+        'navigation.html',
+        'relations.html',
+        'searchbox.html',
+        'donate.html',
+    ]
+}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+
+# -- Options for HTMLHelp output ------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'pyplinkdoc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'pyplink.tex', 'pyplink Documentation',
+     'Louis-Philippe Lemieux Perreault', 'manual'),
+]
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'pyplink', 'pyplink Documentation',
+     [author], 1)
+]
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'pyplink', 'pyplink Documentation',
+     author, 'pyplink', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+
+
+
+# Example configuration for intersphinx: refer to the Python standard library.
+intersphinx_mapping = {
+    'https://docs.python.org/3': None,
+    'numpy': ('http://docs.scipy.org/doc/numpy/', None),
+    'pandas': ('http://pandas.pydata.org/pandas-docs/stable/', None),
+}
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..b9aca61
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,39 @@
+.. pyplink documentation master file, created by
+   sphinx-quickstart on Fri Mar 10 09:00:31 2017.
+
+pyplink
+========
+
+:py:mod:`pyplink` is a Python (2 and 3) binary Plink file parser and writer
+released under the MIT licence. The difference with existing parsers (and Plink
+itself) is that :py:mod:`pyplink` does not load the BED file in memory, making
+possible to work with extremely large files (*e.g.* 1000 Genomes Phase 3
+files).
+
+For more information on how to use :py:mod:`pyplink`, refer to the
+:doc:`API documentation <pyplink>`. Below is a snippet describing the most
+common usage of the module.
+
+
+.. code-block:: python
+
+    from pyplink import PyPlink
+
+    with PyPlink("plink_file_prefix") as bed:
+        # Getting the BIM and FAM
+        bim = bed.get_bim()
+        fam = bed.get_fam()
+
+        # Iterating over all loci
+        for loci_name, genotypes in bed:
+            pass
+
+        # Getting the genotypes of a single marker (numpy.ndarray)
+        genotypes = bed.get_geno_marker("rs12345")
+
+
+.. toctree::
+   :hidden:
+
+   installation
+   pyplink
diff --git a/docs/installation.rst b/docs/installation.rst
new file mode 100644
index 0000000..c7951dc
--- /dev/null
+++ b/docs/installation.rst
@@ -0,0 +1,30 @@
+
+Installation
+=============
+
+You can install :py:mod:`pyplink` using either ``pip`` or ``conda``.
+
+.. code-block:: bash
+
+   # Using pip
+   pip install pyplink
+
+
+.. code-block:: bash
+
+   # Using conda
+   conda config --add channels http://statgen.org/wp-content/uploads/Softwares/pyplink
+   conda install pyplink
+
+
+To test the installation, run ``python`` and execute the following commands.
+
+.. code-block:: python
+
+   >>> import pyplink
+   >>> pyplink.test()
+   .............................................
+   ----------------------------------------------------------------------
+   Ran 45 tests in 0.343s
+
+   OK
diff --git a/docs/pyplink.rst b/docs/pyplink.rst
new file mode 100644
index 0000000..f9315a3
--- /dev/null
+++ b/docs/pyplink.rst
@@ -0,0 +1,11 @@
+
+pyplink's API
+==============
+
+.. autoclass:: pyplink.PyPlink
+   :members:
+
+
+.. automodule:: pyplink
+   :members:
+

From 7a5612de0b2b67a42e005966f775dfa532fac5ca Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 10 Mar 2017 11:20:57 -0500
Subject: [PATCH 15/18] Added a small snippet in the docstring

---
 pyplink/pyplink.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index 6c23c9b..1839d46 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -72,6 +72,18 @@ class PyPlink(object):
 
     Reads or write binary Plink files (BED, BIM and FAM).
 
+    .. code-block:: python
+
+        from pyplink import PyPlink
+
+        # Reading BED files
+        with PyPlink("plink_file_prefix") as bed:
+            pass
+
+        # Writing BED files
+        with PyPlink("plink_file_prefix", "w") as bed:
+            pass
+
     """
 
     # The genotypes values

From 99d8c6079019035ec3028219cd4906196e40e458 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 10 Mar 2017 11:25:38 -0500
Subject: [PATCH 16/18] Changed return_seek for return_index (made more sense)

---
 pyplink/pyplink.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index 1839d46..932d6c5 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -387,8 +387,8 @@ def _read_bed(self):
                                  "{}".format(self.bed_filename))
 
             # Checking the last entry (for BED corruption)
-            seek_position = self._get_seek_position(self._bim.iloc[-1, :].i)
-            bed_file.seek(seek_position)
+            seek_index = self._get_seek_position(self._bim.iloc[-1, :].i)
+            bed_file.seek(seek_index)
             geno = self._geno_values[
                 np.fromstring(bed_file.read(self._nb_bytes), dtype=np.uint8)
             ].flatten(order="C")[:self._nb_samples]
@@ -434,12 +434,12 @@ def iter_acgt_geno(self):
         for i, (marker, geno) in enumerate(self.iter_geno()):
             yield marker, self._allele_encoding[i][geno]
 
-    def iter_geno_marker(self, markers, return_seek=False):
+    def iter_geno_marker(self, markers, return_index=False):
         """Iterates over genotypes for a list of markers.
 
         Args:
             markers (list): The list of markers to iterate onto.
-            return_seek (bool): Wether to return the marker's index or not.
+            return_index (bool): Wether to return the marker's index or not.
 
         Returns:
             tuple: The name of the marker as a string, and its genotypes as a
@@ -454,9 +454,9 @@ def iter_geno_marker(self, markers, return_seek=False):
             markers = [markers]
 
         # Iterating over all markers
-        if return_seek:
+        if return_index:
             for marker in markers:
-                geno, seek = self.get_geno_marker(marker, return_seek=True)
+                geno, seek = self.get_geno_marker(marker, return_index=True)
                 yield marker, geno, seek
         else:
             for marker in markers:
@@ -474,16 +474,16 @@ def iter_acgt_geno_marker(self, markers):
 
         """
         # We iterate over the markers
-        for snp, geno, s in self.iter_geno_marker(markers, return_seek=True):
+        for snp, geno, s in self.iter_geno_marker(markers, return_index=True):
             # Getting the SNP position and converting to ACGT
             yield snp, self._allele_encoding[s][geno]
 
-    def get_geno_marker(self, marker, return_seek=False):
+    def get_geno_marker(self, marker, return_index=False):
         """Gets the genotypes for a given marker.
 
         Args:
             marker (str): The name of the marker.
-            return_seek (bool): Wether to return the marker's index or not.
+            return_index (bool): Wether to return the marker's index or not.
 
         Returns:
             numpy.ndarray: The genotypes of the marker (additive format).
@@ -497,11 +497,11 @@ def get_geno_marker(self, marker, return_seek=False):
             raise ValueError("{}: marker not in BIM".format(marker))
 
         # Seeking to the correct position
-        seek_position = self._bim.loc[marker, "i"]
-        self.seek(seek_position)
+        seek_index = self._bim.loc[marker, "i"]
+        self.seek(seek_index)
 
-        if return_seek:
-            return self._read_current_marker(), seek_position
+        if return_index:
+            return self._read_current_marker(), seek_index
         return self._read_current_marker()
 
     def get_acgt_geno_marker(self, marker):
@@ -515,7 +515,7 @@ def get_acgt_geno_marker(self, marker):
 
         """
         # Getting the marker's genotypes
-        geno, snp_position = self.get_geno_marker(marker, return_seek=True)
+        geno, snp_position = self.get_geno_marker(marker, return_index=True)
 
         # Returning the ACGT's format of the genotypes
         return self._allele_encoding[snp_position][geno]

From a1361062b6fc309371ca1535be851c9e560e06b1 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 10 Mar 2017 12:49:31 -0500
Subject: [PATCH 17/18] Added the link to the short documentation

---
 README.mkd | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.mkd b/README.mkd
index 4e194f4..08c125d 100644
--- a/README.mkd
+++ b/README.mkd
@@ -5,7 +5,9 @@
 
 # pyplink - Module to process Plink's binary files
 
-`PyPlink` is a Python module to read and write Plink's binary files.
+`PyPlink` is a Python module to read and write Plink's binary files. Short
+documentation available at
+[https://lemieuxl.github.io/pyplink/](https://lemieuxl.github.io/pyplink/).
 
 
 ## Dependencies

From d8086c41111aae066f1961d52b2fdeef581af9e1 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 10 Mar 2017 12:50:40 -0500
Subject: [PATCH 18/18] For next release

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 4589db7..1592560 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@
 
 MAJOR = 1
 MINOR = 3
-MICRO = 1
+MICRO = 2
 VERSION = "{}.{}.{}".format(MAJOR, MINOR, MICRO)