From ab21492a1ba92fdda7417c440661d96e1b145ed8 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 20 Jan 2017 09:01:47 -0500
Subject: [PATCH 01/15] Refactor so that PyPlink doesn't load all into memory
 (for large datasets)

---
 pyplink/pyplink.py | 119 +++++++++++++++++++++++++--------------------
 1 file changed, 66 insertions(+), 53 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index ba52dc9..7a37b5f 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -125,7 +125,7 @@ def __init__(self, i_prefix, mode="r", bed_format="SNP-major"):
             self._nb_values = None
 
             # Opening the output BED file
-            self._bed_file = open(self.bed_filename, "wb")
+            self._bed = open(self.bed_filename, "wb")
             self._write_bed_header()
 
         else:
@@ -162,36 +162,43 @@ def __exit__(self, *args):
 
     def close(self):
         """Closes the BED file if required."""
-        if self._mode == "w":
-            # Closing the BED file
-            self._bed_file.close()
+        # Closing the BED file
+        self._bed.close()
 
     def next(self):
         """The next function."""
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
 
-        if self._n < self._nb_markers:
-            self._n += 1
-
-            # We want to return information about the marker and the genotypes
-            geno = self._geno_values[self._bed[self._n - 1]].flatten(order="C")
-            return self._markers[self._n - 1], geno[:self._nb_samples]
-        else:
+        self._n += 1
+        if self._n > self._nb_markers:
             raise StopIteration()
 
+        return self._markers[self._n - 1], self._read_current_marker()
+
+    def _read_current_marker(self):
+        """Reads the current marker and returns its genotypes."""
+        return self._geno_values[
+            np.fromstring(self._bed.read(self._nb_bytes), dtype=np.uint8)
+        ].flatten(order="C")[:self._nb_samples]
+
     def seek(self, n):
         """Gets to a certain position in the BED file when iterating."""
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
 
-        if 0 <= n < len(self._bed):
+        if 0 <= n < self._nb_markers:
             self._n = n
+            self._bed.seek(self._get_seek_position(n))
 
         else:
             # Invalid seek value
             raise ValueError("invalid position in BED: {}".format(n))
 
+    def _get_seek_position(self, n):
+        """Gets the seek position in the file (including special bytes)."""
+        return 3 + self._nb_bytes * n
+
     def _read_bim(self):
         """Reads the BIM file."""
         # Reading the BIM file and setting the values
@@ -204,7 +211,7 @@ def _read_bim(self):
         bim["a2"] = bim["a2"].astype(str)
 
         bim = bim.set_index("snp")
-        bim["i"] = range(len(bim))
+        bim["i"] = range(bim.shape[0])
         bim[2] = bim.a1 * 2           # Original '0'
         bim[1] = bim.a1 + bim.a2      # Original '2'
         bim[0] = bim.a2 * 2           # Original '3'
@@ -219,7 +226,7 @@ def _read_bim(self):
 
         # Saving the data in the object
         self._bim = bim[["chrom", "pos", "cm", "a1", "a2", "i"]]
-        self._nb_markers = len(self._bim)
+        self._nb_markers = self._bim.shape[0]
         self._markers = np.array(list(self._bim.index))
 
     def get_bim(self):
@@ -256,7 +263,7 @@ def _read_fam(self):
 
         # Saving the data in the object
         self._fam = fam
-        self._nb_samples = len(self._fam)
+        self._nb_samples = self._fam.shape[0]
 
     def get_fam(self):
         """Returns the FAM file."""
@@ -278,7 +285,7 @@ def _read_bed(self):
         if (self._bim is None) or (self._fam is None):
             raise RuntimeError("no BIM or FAM file were read")
 
-        data = None
+        # Checking the file is valid by looking at the first 3 bytes
         with open(self.bed_filename, "rb") as bed_file:
             # Checking that the first two bytes are OK
             if (ord(bed_file.read(1)) != 108) or (ord(bed_file.read(1)) != 27):
@@ -290,45 +297,41 @@ def _read_bed(self):
                 raise ValueError("not in SNP-major format (please recode): "
                                  "{}".format(self.bed_filename))
 
-            # The number of bytes per marker
-            nb_bytes = int(np.ceil(self._nb_samples / 4.0))
-
-            # Reading the data
-            data = np.fromfile(bed_file, dtype=np.uint8)
+        # The number of bytes per marker
+        self._nb_bytes = int(np.ceil(self._nb_samples / 4.0))
 
-            # Checking the data
-            if data.shape[0] != (self._nb_markers * nb_bytes):
-                raise ValueError("invalid number of entries: "
-                                 "{}".format(self.bed_filename))
-            data.shape = (self._nb_markers, nb_bytes)
-
-        # Saving the data in the object
-        self._bed = data
+        # Opening the file for the rest of the operations (reading 3 bytes)
+        self._bed = open(self.bed_filename, "rb")
+        self._bed.read(3)
 
     def _write_bed_header(self):
         """Writes the BED first 3 bytes."""
         # Writing the first three bytes
         final_byte = 1 if self._bed_format == "SNP-major" else 0
-        self._bed_file.write(bytearray((108, 27, final_byte)))
+        self._bed.write(bytearray((108, 27, final_byte)))
 
     def iter_geno(self):
-        """Iterates over genotypes."""
+        """Iterates over genotypes from the beginning."""
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
 
-        for i in range(len(self._bed)):
-            geno = self._geno_values[self._bed[i]].flatten(order="C")
-            yield self._markers[i], geno[:self._nb_samples]
+        # Seeking back at the beginning of the file
+        self.seek(0)
+
+        # Return itself (the generator)
+        return self
 
     def iter_acgt_geno(self):
         """Iterates over genotypes (ACGT format)."""
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
 
-        for i in range(len(self._bed)):
-            geno = self._geno_values[self._bed[i]].flatten(order="C")
-            yield (self._markers[i],
-                   self._allele_encoding[i][geno[:self._nb_samples]])
+        # Seeking back at the beginning of the file
+        self.seek(0)
+
+        # Need to iterate over itself, and modify the actual genotypes
+        for i, (marker, geno) in enumerate(self):
+            yield marker, self._allele_encoding[i][geno]
 
     def iter_geno_marker(self, markers):
         """Iterates over genotypes for given markers."""
@@ -350,9 +353,12 @@ def iter_geno_marker(self, markers):
         required_markers = self._bim.loc[markers, :]
 
         # Then, we iterate
-        for snp, i in required_markers.i.iteritems():
-            geno = self._geno_values[self._bed[i]].flatten(order="C")
-            yield snp, geno[:self._nb_samples]
+        for snp, seek_position in required_markers.i.iteritems():
+            # Seeking to the correct position
+            self.seek(seek_position)
+
+            # Getting the value
+            yield snp, self._read_current_marker()
 
     def get_geno_marker(self, marker):
         """Gets the genotypes for a given marker."""
@@ -363,11 +369,11 @@ def get_geno_marker(self, marker):
         if marker not in set(self._bim.index):
             raise ValueError("{}: marker not in BIM".format(marker))
 
-        # Getting all the genotypes
-        i = self._bim.loc[marker, "i"]
-        geno = self._geno_values[self._bed[i]].flatten(order="C")
+        # Seeking to the correct position
+        seek_position = self._bim.loc[marker, "i"]
+        self.seek(seek_position)
 
-        return geno[:self._nb_samples]
+        return self._read_current_marker()
 
     def iter_acgt_geno_marker(self, markers):
         """Iterates over genotypes for given markers (ACGT format)."""
@@ -389,9 +395,13 @@ def iter_acgt_geno_marker(self, markers):
         required_markers = self._bim.loc[markers, :]
 
         # Then, we iterate
-        for snp, i in required_markers.i.iteritems():
-            geno = self._geno_values[self._bed[i]].flatten(order="C")
-            yield snp, self._allele_encoding[i][geno[:self._nb_samples]]
+        for snp, seek_position in required_markers.i.iteritems():
+            # Seeking to the correct position
+            self.seek(seek_position)
+
+            # Getting the genotype and converting to ACGT
+            geno = self._read_current_marker()
+            yield snp, self._allele_encoding[seek_position][geno]
 
     def get_acgt_geno_marker(self, marker):
         """Gets the genotypes for a given marker (ACGT format)."""
@@ -402,11 +412,14 @@ def get_acgt_geno_marker(self, marker):
         if marker not in set(self._bim.index):
             raise ValueError("{}: marker not in BIM".format(marker))
 
-        # Getting all the genotypes
-        i = self._bim.loc[marker, "i"]
-        geno = self._geno_values[self._bed[i]].flatten(order="C")
+        # Seeking to the correct position
+        seek_position = self._bim.loc[marker, "i"]
+        self.seek(seek_position)
+
+        # Getting the genotypes and converting them to ACGT
+        geno = self._read_current_marker()
 
-        return self._allele_encoding[i][geno[:self._nb_samples]]
+        return self._allele_encoding[seek_position][geno]
 
     def write_marker(self, genotypes):
         """Deprecated function."""
@@ -434,7 +447,7 @@ def write_genotypes(self, genotypes):
             g[0] | (g[1] << 2) | (g[2] << 4) | (g[3] << 6) for g in
             self._grouper((_byte_recode[geno] for geno in genotypes), 4)
         ]
-        self._bed_file.write(bytearray(byte_array))
+        self._bed.write(bytearray(byte_array))
 
     @staticmethod
     def _grouper(iterable, n, fillvalue=0):

From ea4382e5d86e6316c500bd0ce51d3dc3eae74496 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 20 Jan 2017 09:02:19 -0500
Subject: [PATCH 02/15] Updated tests to work with current changes

---
 pyplink/tests/test_pyplink.py | 119 ++++++++++++++++++----------------
 1 file changed, 64 insertions(+), 55 deletions(-)

diff --git a/pyplink/tests/test_pyplink.py b/pyplink/tests/test_pyplink.py
index c07dc06..2393890 100644
--- a/pyplink/tests/test_pyplink.py
+++ b/pyplink/tests/test_pyplink.py
@@ -98,7 +98,6 @@ def get_plink(tmp_dir):
             return None, "Plink's URL is not available"
 
         # Unzipping Plink
-        plink_dir = os.path.join(tmp_dir, )
         with zipfile.ZipFile(zip_path, "r") as z:
             z.extractall(tmp_dir)
         plink_path = os.path.join(tmp_dir, os.path.splitext(filename)[0],
@@ -177,16 +176,16 @@ def tearDownClass(cls):
         # Cleaning the temporary directory
         shutil.rmtree(cls.tmp_dir)
 
+    def tearDown(self):
+        # Closing the PyPlink object
+        self.pedfile.close()
+
     def test_pyplink_object_integrity(self):
         """Checks the integrity of the PyPlink object."""
         # Checking the name of the BED file
         self.assertTrue(hasattr(self.pedfile, "bed_filename"))
         self.assertEqual(self.bed, self.pedfile.bed_filename)
 
-        # Checking the BED object
-        self.assertTrue(hasattr(self.pedfile, "_bed"))
-        self.assertTrue(isinstance(self.pedfile._bed, np.ndarray))
-
         # Checking the name of the BIM file
         self.assertTrue(hasattr(self.pedfile, "bim_filename"))
         self.assertEqual(self.bim, self.pedfile.bim_filename)
@@ -205,24 +204,21 @@ def test_pyplink_object_integrity(self):
 
     def test_pyplink_object_error(self):
         """Checks what happens when we play with the PyPlink object."""
-        # Creating a new object
-        data = PyPlink(self.prefix)
-
         # Changing the BIM to None
-        ori = data._bim
-        data._bim = None
+        ori = self.pedfile._bim
+        self.pedfile._bim = None
         with self.assertRaises(RuntimeError) as cm:
-            data._read_bed()
+            self.pedfile._read_bed()
         self.assertEqual("no BIM or FAM file were read", str(cm.exception))
-        data._bim = ori
+        self.pedfile._bim = ori
 
         # Changing the FAM to None
-        ori = data._fam
-        data._fam = None
+        ori = self.pedfile._fam
+        self.pedfile._fam = None
         with self.assertRaises(RuntimeError) as cm:
-            data._read_bed()
+            self.pedfile._read_bed()
         self.assertEqual("no BIM or FAM file were read", str(cm.exception))
-        data._fam = ori
+        self.pedfile._fam = ori
 
     def test_pyplink_bad_bed(self):
         """Checks what happens when we read a bad BED file."""
@@ -239,16 +235,18 @@ def test_pyplink_bad_bed(self):
         with open(new_fam, "w") as o_file, open(self.fam, "r") as i_file:
             o_file.write(i_file.read())
 
-        # Creating a new BED file with invalid number of bytes
-        new_bed = new_prefix + ".bed"
-        with open(new_bed, "wb") as o_file:
-            o_file.write(bytearray([108, 27, 1, 1, 2, 3, 4]))
+        # TODO: Change to parse the actual file completely (since we no longer
+        # load to memory
+#        # Creating a new BED file with invalid number of bytes
+#        new_bed = new_prefix + ".bed"
+#        with open(new_bed, "wb") as o_file:
+#            o_file.write(bytearray([108, 27, 1, 1, 2, 3, 4]))
 
-        # This should raise an exception
-        with self.assertRaises(ValueError) as cm:
-            PyPlink(new_prefix)
-        self.assertEqual("invalid number of entries: {}".format(new_bed),
-                         str(cm.exception))
+#        # This should raise an exception
+#        with self.assertRaises(ValueError) as cm:
+#            PyPlink(new_prefix)
+#        self.assertEqual("invalid number of entries: {}".format(new_bed),
+#                         str(cm.exception))
 
         # Creating a new BED file with invalid first byte
         new_bed = new_prefix + ".bed"
@@ -290,7 +288,7 @@ def test_missing_files(self):
         # Creating dummy BED/BIM/FAM files
         prefix = os.path.join(self.tmp_dir, "test_missing")
         for extension in (".bed", ".bim", ".fam"):
-            with open(prefix + extension, "w") as o_file:
+            with open(prefix + extension, "w"):
                 pass
 
         # Removing the files (one by one) and checking the exception is raised
@@ -300,7 +298,7 @@ def test_missing_files(self):
                 PyPlink(prefix)
             self.assertEqual("No such file: '{}'".format(prefix + extension),
                              str(cm.exception))
-            with open(prefix + extension, "w") as o_file:
+            with open(prefix + extension, "w"):
                 pass
 
     def test_get_nb_markers(self):
@@ -447,7 +445,7 @@ def test_generator(self):
         )
         for (e_marker, e_geno), (marker, geno) in zipped:
             self.assertEqual(e_marker, marker)
-            self.assertTrue((e_geno == geno).all())
+            np.testing.assert_array_equal(e_geno, geno)
 
         # The generator should be empty
         remaining = [(marker, geno) for marker, geno in self.pedfile]
@@ -467,7 +465,7 @@ def test_next(self):
 
         # Comparing
         self.assertEqual(self.markers[0], marker)
-        self.assertTrue((self.genotypes[0] == genotypes).all())
+        np.testing.assert_array_equal(self.genotypes[0], genotypes)
 
     def test_next_w_mode(self):
         """Tests that an exception is raised when calling next in w mode."""
@@ -494,7 +492,7 @@ def test_seek(self):
         self.pedfile.seek(1)
         for (e_marker, e_geno), (marker, geno) in zipped:
             self.assertEqual(e_marker, marker)
-            self.assertTrue((e_geno == geno).all())
+            np.testing.assert_array_equal(e_geno, geno)
 
         # Seeking at the fourth position
         zipped = zip(
@@ -504,7 +502,17 @@ def test_seek(self):
         self.pedfile.seek(3)
         for (e_marker, e_geno), (marker, geno) in zipped:
             self.assertEqual(e_marker, marker)
-            self.assertTrue((e_geno == geno).all())
+            np.testing.assert_array_equal(e_geno, geno)
+
+        # Seeking at the tenth position
+        zipped = zip(
+            [i for i in zip(self.markers[9:], self.genotypes[9:])],
+            self.pedfile,
+        )
+        self.pedfile.seek(9)
+        for (e_marker, e_geno), (marker, geno) in zipped:
+            self.assertEqual(e_marker, marker)
+            np.testing.assert_array_equal(e_geno, geno)
 
         # Seeking at an invalid position
         with self.assertRaises(ValueError) as cm:
@@ -516,6 +524,11 @@ def test_seek(self):
             self.pedfile.seek(100)
         self.assertEqual("invalid position in BED: 100", str(cm.exception))
 
+        # Seeking at an invalid position
+        with self.assertRaises(ValueError) as cm:
+            self.pedfile.seek(10)
+        self.assertEqual("invalid position in BED: 10", str(cm.exception))
+
     def test_seek_w_mode(self):
         """Tests that an exception is raised if in write mode."""
         with self.assertRaises(UnsupportedOperation) as cm:
@@ -532,7 +545,7 @@ def test_iter_geno(self):
         )
         for (e_marker, e_geno), (marker, geno) in zipped:
             self.assertEqual(e_marker, marker)
-            self.assertTrue((e_geno == geno).all())
+            np.testing.assert_array_equal(e_geno, geno)
 
     def test_iter_geno_w_mode(self):
         """Tests that an exception is raised if in write mode."""
@@ -550,7 +563,7 @@ def test_iter_acgt_geno(self):
         )
         for (e_marker, e_geno), (marker, geno) in zipped:
             self.assertEqual(e_marker, marker)
-            self.assertTrue((e_geno == geno).all())
+            np.testing.assert_array_equal(e_geno, geno)
 
     def test_iter_acgt_geno_w_mode(self):
         """Tests that an exception is raised if in write mode."""
@@ -576,7 +589,7 @@ def test_iter_geno_marker(self):
         )
         for (e_marker, e_geno), (marker, geno) in zipped:
             self.assertEqual(e_marker, marker)
-            self.assertTrue((e_geno == geno).all())
+            np.testing.assert_array_equal(e_geno, geno)
 
         # Testing a single marker
         index = random.randint(0, len(self.markers) - 1)
@@ -584,7 +597,7 @@ def test_iter_geno_marker(self):
         e_geno = self.genotypes[index]
         for marker, geno in self.pedfile.iter_geno_marker(e_marker):
             self.assertEqual(e_marker, marker)
-            self.assertTrue((e_geno == geno).all())
+            np.testing.assert_array_equal(e_geno, geno)
 
         # Adding a marker that doesn't exist
         markers.extend(["unknown_1", "unknown_2"])
@@ -617,7 +630,7 @@ def test_iter_acgt_geno_marker(self):
         )
         for (e_marker, e_geno), (marker, geno) in zipped:
             self.assertEqual(e_marker, marker)
-            self.assertTrue((e_geno == geno).all())
+            np.testing.assert_array_equal(e_geno, geno)
 
         # Testing a single marker
         index = random.randint(0, len(self.markers) - 1)
@@ -625,7 +638,7 @@ def test_iter_acgt_geno_marker(self):
         e_geno = self.acgt_genotypes[index]
         for marker, geno in self.pedfile.iter_acgt_geno_marker(e_marker):
             self.assertEqual(e_marker, marker)
-            self.assertTrue((e_geno == geno).all())
+            np.testing.assert_array_equal(e_geno, geno)
 
         # Adding a marker that doesn't exist
         markers.extend(["unknown_3", "unknown_4"])
@@ -670,14 +683,10 @@ def test_repr_w_mode(self):
         e_repr = 'PyPlink(mode="w")'
 
         # Creating the dummy PyPlink object
-        pedfile = PyPlink(os.path.join(self.tmp_dir, "test_repr"), "w")
-
-        # Comparing the expected with the observed representation
-        o_repr = str(pedfile)
-        self.assertEqual(e_repr, o_repr)
-
-        # Closing the file
-        pedfile.close()
+        with PyPlink(os.path.join(self.tmp_dir, "test_repr"), "w") as pedfile:
+            # Comparing the expected with the observed representation
+            o_repr = str(pedfile)
+            self.assertEqual(e_repr, o_repr)
 
     def test_get_geno_marker(self):
         """Tests the 'get_geno_marker' function."""
@@ -688,7 +697,7 @@ def test_get_geno_marker(self):
 
         # Getting the genotype
         o_geno = self.pedfile.get_geno_marker(marker)
-        self.assertTrue((o_geno == e_geno).all())
+        np.testing.assert_array_equal(o_geno, e_geno)
 
         # Asking for an unknown marker should raise an ValueError
         with self.assertRaises(ValueError) as cm:
@@ -703,7 +712,7 @@ def test_get_geno_marker_w_mode(self):
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
             with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
-                genotypes = p.get_geno_marker("M1")
+                p.get_geno_marker("M1")
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
     def test_get_iter_w_mode(self):
@@ -711,7 +720,7 @@ def test_get_iter_w_mode(self):
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
             with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
-                genotypes = iter(p)
+                iter(p)
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
     def test_get_acgt_geno_marker(self):
@@ -723,7 +732,7 @@ def test_get_acgt_geno_marker(self):
 
         # Getting the genotype
         o_geno = self.pedfile.get_acgt_geno_marker(marker)
-        self.assertTrue((o_geno == e_geno).all())
+        np.testing.assert_array_equal(o_geno, e_geno)
 
         # Asking for an unknown marker should raise an ValueError
         with self.assertRaises(ValueError) as cm:
@@ -735,7 +744,7 @@ def test_get_acgt_geno_marker_w_mode(self):
         with self.assertRaises(UnsupportedOperation) as cm:
             # Creating the dummy PyPlink object
             with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p:
-                genotypes = p.get_acgt_geno_marker("M1")
+                p.get_acgt_geno_marker("M1")
         self.assertEqual("not available in 'w' mode", str(cm.exception))
 
     def test_get_context_read_mode(self):
@@ -746,7 +755,7 @@ def test_get_context_read_mode(self):
     def test_invalid_mode(self):
         """Tests invalid mode when PyPlink as context manager."""
         with self.assertRaises(ValueError) as cm:
-            genotypes = PyPlink(self.prefix, "u")
+            PyPlink(self.prefix, "u")
         self.assertEqual("invalid mode: 'u'", str(cm.exception))
 
     def test_write_binary(self):
@@ -782,10 +791,10 @@ def test_write_binary(self):
                       file=o_file)
 
         # Reading the written binary file
-        pedfile = PyPlink(test_prefix)
-        for i, (marker, genotypes) in enumerate(pedfile):
-            self.assertEqual("m{}".format(i+1), marker)
-            self.assertTrue((expected_genotypes[i] == genotypes).all())
+        with PyPlink(test_prefix) as pedfile:
+            for i, (marker, genotypes) in enumerate(pedfile):
+                self.assertEqual("m{}".format(i+1), marker)
+                np.testing.assert_array_equal(expected_genotypes[i], genotypes)
 
     def test_write_binary_error(self):
         """Tests writing a binary file, with different number of sample."""

From 2fcdda2429e5aa33e1f838911ddda49cfef0d29a Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 20 Jan 2017 09:06:53 -0500
Subject: [PATCH 03/15] Added python 3.6 to testing, and pip install
 requirements to be faster for some python version

---
 .travis.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index 07b72bd..33cf18c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,7 +4,11 @@ python:
   - "3.3"
   - "3.4"
   - "3.5"
+  - "3.6"
 install:
+  - pip install six
+  - pip install numpy
+  - pip install pandas
   - pip install coveralls
 script:
   - coverage run setup.py test

From 711807ed45b0900e0723cc36e0d4ccff6a0424d3 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 20 Jan 2017 09:15:29 -0500
Subject: [PATCH 04/15] Added 3.6 to classifiers

---
 setup.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index eed0f10..10f5ac7 100644
--- a/setup.py
+++ b/setup.py
@@ -65,9 +65,11 @@ def setup_package():
                      "Programming Language :: Python :: 3",
                      "Programming Language :: Python :: 3.3",
                      "Programming Language :: Python :: 3.4",
+                     "Programming Language :: Python :: 3.5",
+                     "Programming Language :: Python :: 3.6",
                      "License :: OSI Approved :: MIT License",
                      "Topic :: Scientific/Engineering :: Bio-Informatics"],
-        keywords="bioinformatics format plink binary",
+        keywords="bioinformatics format Plink binary",
     )
 
     return

From 732e1fc86383d16d385645235932c64fa0f2b5dd Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 20 Jan 2017 09:22:06 -0500
Subject: [PATCH 05/15] Ran the demo, just to be safe

---
 demo/PyPlink Demo.ipynb | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/demo/PyPlink Demo.ipynb b/demo/PyPlink Demo.ipynb
index 8e2f0bb..9eaf1bb 100644
--- a/demo/PyPlink Demo.ipynb	
+++ b/demo/PyPlink Demo.ipynb	
@@ -1146,7 +1146,7 @@
       "\n",
       "Skipping web check... [ --noweb ] \n",
       "Writing this text to log file [ plink.log ]\n",
-      "Analysis started: Wed Nov  4 14:50:58 2015\n",
+      "Analysis started: Fri Jan 20 09:20:55 2017\n",
       "\n",
       "Options in effect:\n",
       "\t--noweb\n",
@@ -1171,7 +1171,7 @@
       "10 founders and 0 non-founders found\n",
       "Writing allele frequencies (founders-only) to [ plink.frq ] \n",
       "\n",
-      "Analysis finished: Wed Nov  4 14:50:58 2015\n",
+      "Analysis finished: Fri Jan 20 09:20:55 2017\n",
       "\n"
      ]
     }
@@ -1287,7 +1287,7 @@
       "\n",
       "Skipping web check... [ --noweb ] \n",
       "Writing this text to log file [ plink_2.log ]\n",
-      "Analysis started: Wed Nov  4 14:50:58 2015\n",
+      "Analysis started: Fri Jan 20 09:21:10 2017\n",
       "\n",
       "Options in effect:\n",
       "\t--noweb\n",
@@ -1314,7 +1314,7 @@
       "10 founders and 0 non-founders found\n",
       "Writing allele frequencies (founders-only) to [ plink_2.frq ] \n",
       "\n",
-      "Analysis finished: Wed Nov  4 14:50:58 2015\n",
+      "Analysis finished: Fri Jan 20 09:21:10 2017\n",
       "\n"
      ]
     }
@@ -1370,7 +1370,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.5.0"
+   "version": "3.5.2"
   }
  },
  "nbformat": 4,

From 2f2e9a019f00aee4037f01387b9551c05af69524 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 20 Jan 2017 09:45:21 -0500
Subject: [PATCH 06/15] Now check if BED might be corrupted by looking last
 entry

---
 pyplink/pyplink.py            | 16 +++++++++++++---
 pyplink/tests/test_pyplink.py | 26 ++++++++++++--------------
 2 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index 7a37b5f..40ae267 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -285,7 +285,11 @@ def _read_bed(self):
         if (self._bim is None) or (self._fam is None):
             raise RuntimeError("no BIM or FAM file were read")
 
-        # Checking the file is valid by looking at the first 3 bytes
+        # The number of bytes per marker
+        self._nb_bytes = int(np.ceil(self._nb_samples / 4.0))
+
+        # Checking the file is valid by looking at the first 3 bytes and the
+        # last entry (correct size)
         with open(self.bed_filename, "rb") as bed_file:
             # Checking that the first two bytes are OK
             if (ord(bed_file.read(1)) != 108) or (ord(bed_file.read(1)) != 27):
@@ -297,8 +301,14 @@ def _read_bed(self):
                 raise ValueError("not in SNP-major format (please recode): "
                                  "{}".format(self.bed_filename))
 
-        # The number of bytes per marker
-        self._nb_bytes = int(np.ceil(self._nb_samples / 4.0))
+            # Checking the last entry
+            seek_position = self._get_seek_position(self._bim.iloc[-1, :].i)
+            bed_file.seek(seek_position)
+            geno = self._geno_values[
+                np.fromstring(bed_file.read(self._nb_bytes), dtype=np.uint8)
+            ].flatten(order="C")[:self._nb_samples]
+            if geno.shape[0] != self._nb_samples:
+                raise ValueError("invalid number of entries: corrupted BED?")
 
         # Opening the file for the rest of the operations (reading 3 bytes)
         self._bed = open(self.bed_filename, "rb")
diff --git a/pyplink/tests/test_pyplink.py b/pyplink/tests/test_pyplink.py
index 2393890..4ee0ea6 100644
--- a/pyplink/tests/test_pyplink.py
+++ b/pyplink/tests/test_pyplink.py
@@ -231,22 +231,20 @@ def test_pyplink_bad_bed(self):
             o_file.write(i_file.read())
 
         # Copying the BIM file
-        new_fam = new_prefix + ".bim"
-        with open(new_fam, "w") as o_file, open(self.fam, "r") as i_file:
+        new_bim = new_prefix + ".bim"
+        with open(new_bim, "w") as o_file, open(self.bim, "r") as i_file:
             o_file.write(i_file.read())
 
-        # TODO: Change to parse the actual file completely (since we no longer
-        # load to memory
-#        # Creating a new BED file with invalid number of bytes
-#        new_bed = new_prefix + ".bed"
-#        with open(new_bed, "wb") as o_file:
-#            o_file.write(bytearray([108, 27, 1, 1, 2, 3, 4]))
-
-#        # This should raise an exception
-#        with self.assertRaises(ValueError) as cm:
-#            PyPlink(new_prefix)
-#        self.assertEqual("invalid number of entries: {}".format(new_bed),
-#                         str(cm.exception))
+        # Creating a new BED file with invalid number of bytes
+        new_bed = new_prefix + ".bed"
+        with open(new_bed, "wb") as o_file:
+            o_file.write(bytearray([108, 27, 1, 1, 2, 3, 4]))
+
+        # This should raise an exception
+        with self.assertRaises(ValueError) as cm:
+            PyPlink(new_prefix)
+        self.assertEqual("invalid number of entries: corrupted BED?",
+                         str(cm.exception))
 
         # Creating a new BED file with invalid first byte
         new_bed = new_prefix + ".bed"

From 3938b8218137524036382f2bbf8f97dcec19ec9f Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 20 Jan 2017 09:54:08 -0500
Subject: [PATCH 07/15] Simplified the 'iter_acgt_geno' function so that it
 uses 'iter_geno' directly.

---
 pyplink/pyplink.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index 40ae267..ad6c1ea 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -333,14 +333,8 @@ def iter_geno(self):
 
     def iter_acgt_geno(self):
         """Iterates over genotypes (ACGT format)."""
-        if self._mode != "r":
-            raise UnsupportedOperation("not available in 'w' mode")
-
-        # Seeking back at the beginning of the file
-        self.seek(0)
-
         # Need to iterate over itself, and modify the actual genotypes
-        for i, (marker, geno) in enumerate(self):
+        for i, (marker, geno) in enumerate(self.iter_geno()):
             yield marker, self._allele_encoding[i][geno]
 
     def iter_geno_marker(self, markers):

From 2d003fc5c8e843baf0b2e219f0d77da322c3c75c Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 20 Jan 2017 09:54:35 -0500
Subject: [PATCH 08/15] Reordered functions

---
 pyplink/pyplink.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index ad6c1ea..5b0fcff 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -364,21 +364,6 @@ def iter_geno_marker(self, markers):
             # Getting the value
             yield snp, self._read_current_marker()
 
-    def get_geno_marker(self, marker):
-        """Gets the genotypes for a given marker."""
-        if self._mode != "r":
-            raise UnsupportedOperation("not available in 'w' mode")
-
-        # Check if the marker exists
-        if marker not in set(self._bim.index):
-            raise ValueError("{}: marker not in BIM".format(marker))
-
-        # Seeking to the correct position
-        seek_position = self._bim.loc[marker, "i"]
-        self.seek(seek_position)
-
-        return self._read_current_marker()
-
     def iter_acgt_geno_marker(self, markers):
         """Iterates over genotypes for given markers (ACGT format)."""
         if self._mode != "r":
@@ -407,6 +392,21 @@ def iter_acgt_geno_marker(self, markers):
             geno = self._read_current_marker()
             yield snp, self._allele_encoding[seek_position][geno]
 
+    def get_geno_marker(self, marker):
+        """Gets the genotypes for a given marker."""
+        if self._mode != "r":
+            raise UnsupportedOperation("not available in 'w' mode")
+
+        # Check if the marker exists
+        if marker not in set(self._bim.index):
+            raise ValueError("{}: marker not in BIM".format(marker))
+
+        # Seeking to the correct position
+        seek_position = self._bim.loc[marker, "i"]
+        self.seek(seek_position)
+
+        return self._read_current_marker()
+
     def get_acgt_geno_marker(self, marker):
         """Gets the genotypes for a given marker (ACGT format)."""
         if self._mode != "r":

From bf047ac966fcc15b595031b832eff9c0d8cdee18 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 20 Jan 2017 09:57:57 -0500
Subject: [PATCH 09/15] Simplified the 'iter_acgt_geno_marker' function so that
 it uses 'iter_geno_marker' directly.

---
 pyplink/pyplink.py | 30 +++++-------------------------
 1 file changed, 5 insertions(+), 25 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index 5b0fcff..79c6c06 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -366,31 +366,11 @@ def iter_geno_marker(self, markers):
 
     def iter_acgt_geno_marker(self, markers):
         """Iterates over genotypes for given markers (ACGT format)."""
-        if self._mode != "r":
-            raise UnsupportedOperation("not available in 'w' mode")
-
-        # If string, we change to list
-        if isinstance(markers, str):
-            markers = [markers]
-
-        # Checking the list of required markers
-        unknown_markers = set(markers) - set(self._bim.index)
-        if len(unknown_markers) > 0:
-            raise ValueError("{}: marker not in BIM".format(
-                sorted(unknown_markers)
-            ))
-
-        # Getting the required markers
-        required_markers = self._bim.loc[markers, :]
-
-        # Then, we iterate
-        for snp, seek_position in required_markers.i.iteritems():
-            # Seeking to the correct position
-            self.seek(seek_position)
-
-            # Getting the genotype and converting to ACGT
-            geno = self._read_current_marker()
-            yield snp, self._allele_encoding[seek_position][geno]
+        # We iterate over the markers
+        for snp, geno in self.iter_geno_marker(markers):
+            # Getting the SNP position and converting to ACGT
+            snp_position = self._bim.loc[snp, "i"]
+            yield snp, self._allele_encoding[snp_position][geno]
 
     def get_geno_marker(self, marker):
         """Gets the genotypes for a given marker."""

From 42024fba7c64f8b571afb7d1f7b96087fbb5435d Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 20 Jan 2017 10:02:27 -0500
Subject: [PATCH 10/15] Simplified the 'get_acgt_geno_marker' function so that
 it uses 'get_geno_marker' directly.

---
 pyplink/pyplink.py | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index 79c6c06..3ca6fcc 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -389,21 +389,14 @@ def get_geno_marker(self, marker):
 
     def get_acgt_geno_marker(self, marker):
         """Gets the genotypes for a given marker (ACGT format)."""
-        if self._mode != "r":
-            raise UnsupportedOperation("not available in 'w' mode")
-
-        # Check if the marker exists
-        if marker not in set(self._bim.index):
-            raise ValueError("{}: marker not in BIM".format(marker))
-
-        # Seeking to the correct position
-        seek_position = self._bim.loc[marker, "i"]
-        self.seek(seek_position)
+        # Getting the marker's genotypes
+        geno = self.get_geno_marker(marker)
 
-        # Getting the genotypes and converting them to ACGT
-        geno = self._read_current_marker()
+        # Getting the marker's position
+        snp_position = self._bim.loc[marker, "i"]
 
-        return self._allele_encoding[seek_position][geno]
+        # Returning the ACGT's format of the genotypes
+        return self._allele_encoding[snp_position][geno]
 
     def write_marker(self, genotypes):
         """Deprecated function."""

From 271a6a3f061e8c76a5ef40e337085f0390511e88 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 20 Jan 2017 10:05:53 -0500
Subject: [PATCH 11/15] Made it so that the we doesn't have to relook at the
 SNP position when converting to ACGT format

---
 pyplink/pyplink.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index 3ca6fcc..9b03211 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -361,6 +361,9 @@ def iter_geno_marker(self, markers):
             # Seeking to the correct position
             self.seek(seek_position)
 
+            # Setting the current SNP position
+            self._curr_snp_position = seek_position
+
             # Getting the value
             yield snp, self._read_current_marker()
 
@@ -369,8 +372,7 @@ def iter_acgt_geno_marker(self, markers):
         # We iterate over the markers
         for snp, geno in self.iter_geno_marker(markers):
             # Getting the SNP position and converting to ACGT
-            snp_position = self._bim.loc[snp, "i"]
-            yield snp, self._allele_encoding[snp_position][geno]
+            yield snp, self._allele_encoding[self._curr_snp_position][geno]
 
     def get_geno_marker(self, marker):
         """Gets the genotypes for a given marker."""
@@ -385,6 +387,9 @@ def get_geno_marker(self, marker):
         seek_position = self._bim.loc[marker, "i"]
         self.seek(seek_position)
 
+        # Setting the current SNP position
+        self._curr_snp_position = seek_position
+
         return self._read_current_marker()
 
     def get_acgt_geno_marker(self, marker):
@@ -392,11 +397,8 @@ def get_acgt_geno_marker(self, marker):
         # Getting the marker's genotypes
         geno = self.get_geno_marker(marker)
 
-        # Getting the marker's position
-        snp_position = self._bim.loc[marker, "i"]
-
         # Returning the ACGT's format of the genotypes
-        return self._allele_encoding[snp_position][geno]
+        return self._allele_encoding[self._curr_snp_position][geno]
 
     def write_marker(self, genotypes):
         """Deprecated function."""

From 5556500f3a64d7d98baedced645eb0b7661ea169 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 20 Jan 2017 11:06:01 -0500
Subject: [PATCH 12/15] Added parameters to get seek (snp) position required by
 ACGT genotypes conversion.

---
 pyplink/pyplink.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index 9b03211..d5cefc0 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -337,7 +337,7 @@ def iter_acgt_geno(self):
         for i, (marker, geno) in enumerate(self.iter_geno()):
             yield marker, self._allele_encoding[i][geno]
 
-    def iter_geno_marker(self, markers):
+    def iter_geno_marker(self, markers, return_seek=False):
         """Iterates over genotypes for given markers."""
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
@@ -361,20 +361,20 @@ def iter_geno_marker(self, markers):
             # Seeking to the correct position
             self.seek(seek_position)
 
-            # Setting the current SNP position
-            self._curr_snp_position = seek_position
-
             # Getting the value
-            yield snp, self._read_current_marker()
+            if return_seek:
+                yield snp, self._read_current_marker(), seek_position
+            else:
+                yield snp, self._read_current_marker()
 
     def iter_acgt_geno_marker(self, markers):
         """Iterates over genotypes for given markers (ACGT format)."""
         # We iterate over the markers
-        for snp, geno in self.iter_geno_marker(markers):
+        for snp, geno, s in self.iter_geno_marker(markers, return_seek=True):
             # Getting the SNP position and converting to ACGT
-            yield snp, self._allele_encoding[self._curr_snp_position][geno]
+            yield snp, self._allele_encoding[s][geno]
 
-    def get_geno_marker(self, marker):
+    def get_geno_marker(self, marker, return_seek=False):
         """Gets the genotypes for a given marker."""
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
@@ -387,18 +387,17 @@ def get_geno_marker(self, marker):
         seek_position = self._bim.loc[marker, "i"]
         self.seek(seek_position)
 
-        # Setting the current SNP position
-        self._curr_snp_position = seek_position
-
+        if return_seek:
+            return self._read_current_marker(), seek_position
         return self._read_current_marker()
 
     def get_acgt_geno_marker(self, marker):
         """Gets the genotypes for a given marker (ACGT format)."""
         # Getting the marker's genotypes
-        geno = self.get_geno_marker(marker)
+        geno, snp_position = self.get_geno_marker(marker, return_seek=True)
 
         # Returning the ACGT's format of the genotypes
-        return self._allele_encoding[self._curr_snp_position][geno]
+        return self._allele_encoding[snp_position][geno]
 
     def write_marker(self, genotypes):
         """Deprecated function."""

From 1f541f3fc0f1dc89485eaf4020bb5b3b95740440 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 20 Jan 2017 11:14:28 -0500
Subject: [PATCH 13/15] Updated the README

---
 README.mkd | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.mkd b/README.mkd
index b23901e..d9d0e65 100644
--- a/README.mkd
+++ b/README.mkd
@@ -10,8 +10,8 @@
 
 ## Dependencies
 
-The tool requires a standard [Python](http://python.org/) installation (2.7 or
-3.4) with the following modules:
+The tool requires a standard [Python](http://python.org/) installation (2.7 and
+3.3 or higher are supported) with the following modules:
 
 1. [numpy](http://www.numpy.org/) version 1.8.2 or latest
 2. [pandas](http://pandas.pydata.org/) version 0.14.1 or latest
@@ -70,9 +70,9 @@ To test the module, just perform the following command:
 ```python
 >>> import pyplink
 >>> pyplink.test()
-............................................
+.............................................
 ----------------------------------------------------------------------
-Ran 44 tests in 0.468s
+Ran 45 tests in 0.480s
 
 OK
 ```

From 01acecc14da137c5b1aa416d98eb1418c8cb8189 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 20 Jan 2017 12:34:26 -0500
Subject: [PATCH 14/15] Updated setup file for next version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 10f5ac7..67c6c82 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@
 
 
 MAJOR = 1
-MINOR = 2
+MINOR = 3
 MICRO = 0
 VERSION = "{}.{}.{}".format(MAJOR, MINOR, MICRO)
 

From 1837abea6873b6745bcd6b973cf3d1ede1b9e3d4 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Fri, 20 Jan 2017 12:34:40 -0500
Subject: [PATCH 15/15] Updated script for building conda package

---
 conda_build.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/conda_build.sh b/conda_build.sh
index a4033be..8f04ac9 100755
--- a/conda_build.sh
+++ b/conda_build.sh
@@ -8,8 +8,8 @@ pushd skeleton
 conda skeleton pypi pyplink
 
 # The different python versions and platforms
-python_versions="2.7 3.3 3.4 3.5"
-platforms="linux-32 linux-64 osx-64 win-32 win-64"
+python_versions="2.7 3.3 3.4 3.5 3.6"
+platforms="all"
 
 # Building
 for python_version in $python_versions