Release 1.3.0

lemieuxl · Jan 20, 2017 · b61ce30 · b61ce30
2 parents 2648dc3 + 1837abe
commit b61ce30
Show file tree

Hide file tree

Showing 7 changed files with 152 additions and 148 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -4,7 +4,11 @@ python:
   - "3.3"
   - "3.4"
   - "3.5"
+  - "3.6"
 install:
+  - pip install six
+  - pip install numpy
+  - pip install pandas
   - pip install coveralls
 script:
   - coverage run setup.py test

diff --git a/README.mkd b/README.mkd
@@ -10,8 +10,8 @@
 
 ## Dependencies
 
-The tool requires a standard [Python](http://python.org/) installation (2.7 or
-3.4) with the following modules:
+The tool requires a standard [Python](http://python.org/) installation (2.7 and
+3.3 or higher are supported) with the following modules:
 
 1. [numpy](http://www.numpy.org/) version 1.8.2 or latest
 2. [pandas](http://pandas.pydata.org/) version 0.14.1 or latest
@@ -70,9 +70,9 @@ To test the module, just perform the following command:
 ```python
 >>> import pyplink
 >>> pyplink.test()
-............................................
+.............................................
 ----------------------------------------------------------------------
-Ran 44 tests in 0.468s
+Ran 45 tests in 0.480s
 
 OK
 ```

diff --git a/conda_build.sh b/conda_build.sh
@@ -8,8 +8,8 @@ pushd skeleton
 conda skeleton pypi pyplink
 
 # The different python versions and platforms
-python_versions="2.7 3.3 3.4 3.5"
-platforms="linux-32 linux-64 osx-64 win-32 win-64"
+python_versions="2.7 3.3 3.4 3.5 3.6"
+platforms="all"
 
 # Building
 for python_version in $python_versions

diff --git a/demo/PyPlink Demo.ipynb b/demo/PyPlink Demo.ipynb
@@ -1146,7 +1146,7 @@
       "\n",
       "Skipping web check... [ --noweb ] \n",
       "Writing this text to log file [ plink.log ]\n",
-      "Analysis started: Wed Nov  4 14:50:58 2015\n",
+      "Analysis started: Fri Jan 20 09:20:55 2017\n",
       "\n",
       "Options in effect:\n",
       "\t--noweb\n",
@@ -1171,7 +1171,7 @@
       "10 founders and 0 non-founders found\n",
       "Writing allele frequencies (founders-only) to [ plink.frq ] \n",
       "\n",
-      "Analysis finished: Wed Nov  4 14:50:58 2015\n",
+      "Analysis finished: Fri Jan 20 09:20:55 2017\n",
       "\n"
      ]
     }
@@ -1287,7 +1287,7 @@
       "\n",
       "Skipping web check... [ --noweb ] \n",
       "Writing this text to log file [ plink_2.log ]\n",
-      "Analysis started: Wed Nov  4 14:50:58 2015\n",
+      "Analysis started: Fri Jan 20 09:21:10 2017\n",
       "\n",
       "Options in effect:\n",
       "\t--noweb\n",
@@ -1314,7 +1314,7 @@
       "10 founders and 0 non-founders found\n",
       "Writing allele frequencies (founders-only) to [ plink_2.frq ] \n",
       "\n",
-      "Analysis finished: Wed Nov  4 14:50:58 2015\n",
+      "Analysis finished: Fri Jan 20 09:21:10 2017\n",
       "\n"
      ]
     }
@@ -1370,7 +1370,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.5.0"
+   "version": "3.5.2"
   }
  },
  "nbformat": 4,

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
@@ -125,7 +125,7 @@ def __init__(self, i_prefix, mode="r", bed_format="SNP-major"):
             self._nb_values = None
 
             # Opening the output BED file
-            self._bed_file = open(self.bed_filename, "wb")
+            self._bed = open(self.bed_filename, "wb")
             self._write_bed_header()
 
         else:
@@ -162,36 +162,43 @@ def __exit__(self, *args):
 
     def close(self):
         """Closes the BED file if required."""
-        if self._mode == "w":
-            # Closing the BED file
-            self._bed_file.close()
+        # Closing the BED file
+        self._bed.close()
 
     def next(self):
         """The next function."""
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
 
-        if self._n < self._nb_markers:
-            self._n += 1
-
-            # We want to return information about the marker and the genotypes
-            geno = self._geno_values[self._bed[self._n - 1]].flatten(order="C")
-            return self._markers[self._n - 1], geno[:self._nb_samples]
-        else:
+        self._n += 1
+        if self._n > self._nb_markers:
             raise StopIteration()
 
+        return self._markers[self._n - 1], self._read_current_marker()
+
+    def _read_current_marker(self):
+        """Reads the current marker and returns its genotypes."""
+        return self._geno_values[
+            np.fromstring(self._bed.read(self._nb_bytes), dtype=np.uint8)
+        ].flatten(order="C")[:self._nb_samples]
+
     def seek(self, n):
         """Gets to a certain position in the BED file when iterating."""
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
 
-        if 0 <= n < len(self._bed):
+        if 0 <= n < self._nb_markers:
             self._n = n
+            self._bed.seek(self._get_seek_position(n))
 
         else:
             # Invalid seek value
             raise ValueError("invalid position in BED: {}".format(n))
 
+    def _get_seek_position(self, n):
+        """Gets the seek position in the file (including special bytes)."""
+        return 3 + self._nb_bytes * n
+
     def _read_bim(self):
         """Reads the BIM file."""
         # Reading the BIM file and setting the values
@@ -204,7 +211,7 @@ def _read_bim(self):
         bim["a2"] = bim["a2"].astype(str)
 
         bim = bim.set_index("snp")
-        bim["i"] = range(len(bim))
+        bim["i"] = range(bim.shape[0])
         bim[2] = bim.a1 * 2           # Original '0'
         bim[1] = bim.a1 + bim.a2      # Original '2'
         bim[0] = bim.a2 * 2           # Original '3'
@@ -219,7 +226,7 @@ def _read_bim(self):
 
         # Saving the data in the object
         self._bim = bim[["chrom", "pos", "cm", "a1", "a2", "i"]]
-        self._nb_markers = len(self._bim)
+        self._nb_markers = self._bim.shape[0]
         self._markers = np.array(list(self._bim.index))
 
     def get_bim(self):
@@ -256,7 +263,7 @@ def _read_fam(self):
 
         # Saving the data in the object
         self._fam = fam
-        self._nb_samples = len(self._fam)
+        self._nb_samples = self._fam.shape[0]
 
     def get_fam(self):
         """Returns the FAM file."""
@@ -278,7 +285,11 @@ def _read_bed(self):
         if (self._bim is None) or (self._fam is None):
             raise RuntimeError("no BIM or FAM file were read")
 
-        data = None
+        # The number of bytes per marker
+        self._nb_bytes = int(np.ceil(self._nb_samples / 4.0))
+
+        # Checking the file is valid by looking at the first 3 bytes and the
+        # last entry (correct size)
         with open(self.bed_filename, "rb") as bed_file:
             # Checking that the first two bytes are OK
             if (ord(bed_file.read(1)) != 108) or (ord(bed_file.read(1)) != 27):
@@ -290,47 +301,43 @@ def _read_bed(self):
                 raise ValueError("not in SNP-major format (please recode): "
                                  "{}".format(self.bed_filename))
 
-            # The number of bytes per marker
-            nb_bytes = int(np.ceil(self._nb_samples / 4.0))
+            # Checking the last entry
+            seek_position = self._get_seek_position(self._bim.iloc[-1, :].i)
+            bed_file.seek(seek_position)
+            geno = self._geno_values[
+                np.fromstring(bed_file.read(self._nb_bytes), dtype=np.uint8)
+            ].flatten(order="C")[:self._nb_samples]
+            if geno.shape[0] != self._nb_samples:
+                raise ValueError("invalid number of entries: corrupted BED?")
 
-            # Reading the data
-            data = np.fromfile(bed_file, dtype=np.uint8)
-
-            # Checking the data
-            if data.shape[0] != (self._nb_markers * nb_bytes):
-                raise ValueError("invalid number of entries: "
-                                 "{}".format(self.bed_filename))
-            data.shape = (self._nb_markers, nb_bytes)
-
-        # Saving the data in the object
-        self._bed = data
+        # Opening the file for the rest of the operations (reading 3 bytes)
+        self._bed = open(self.bed_filename, "rb")
+        self._bed.read(3)
 
     def _write_bed_header(self):
         """Writes the BED first 3 bytes."""
         # Writing the first three bytes
         final_byte = 1 if self._bed_format == "SNP-major" else 0
-        self._bed_file.write(bytearray((108, 27, final_byte)))
+        self._bed.write(bytearray((108, 27, final_byte)))
 
     def iter_geno(self):
-        """Iterates over genotypes."""
+        """Iterates over genotypes from the beginning."""
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
 
-        for i in range(len(self._bed)):
-            geno = self._geno_values[self._bed[i]].flatten(order="C")
-            yield self._markers[i], geno[:self._nb_samples]
+        # Seeking back at the beginning of the file
+        self.seek(0)
+
+        # Return itself (the generator)
+        return self
 
     def iter_acgt_geno(self):
         """Iterates over genotypes (ACGT format)."""
-        if self._mode != "r":
-            raise UnsupportedOperation("not available in 'w' mode")
-
-        for i in range(len(self._bed)):
-            geno = self._geno_values[self._bed[i]].flatten(order="C")
-            yield (self._markers[i],
-                   self._allele_encoding[i][geno[:self._nb_samples]])
+        # Need to iterate over itself, and modify the actual genotypes
+        for i, (marker, geno) in enumerate(self.iter_geno()):
+            yield marker, self._allele_encoding[i][geno]
 
-    def iter_geno_marker(self, markers):
+    def iter_geno_marker(self, markers, return_seek=False):
         """Iterates over genotypes for given markers."""
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
@@ -350,11 +357,24 @@ def iter_geno_marker(self, markers):
         required_markers = self._bim.loc[markers, :]
 
         # Then, we iterate
-        for snp, i in required_markers.i.iteritems():
-            geno = self._geno_values[self._bed[i]].flatten(order="C")
-            yield snp, geno[:self._nb_samples]
+        for snp, seek_position in required_markers.i.iteritems():
+            # Seeking to the correct position
+            self.seek(seek_position)
+
+            # Getting the value
+            if return_seek:
+                yield snp, self._read_current_marker(), seek_position
+            else:
+                yield snp, self._read_current_marker()
+
+    def iter_acgt_geno_marker(self, markers):
+        """Iterates over genotypes for given markers (ACGT format)."""
+        # We iterate over the markers
+        for snp, geno, s in self.iter_geno_marker(markers, return_seek=True):
+            # Getting the SNP position and converting to ACGT
+            yield snp, self._allele_encoding[s][geno]
 
-    def get_geno_marker(self, marker):
+    def get_geno_marker(self, marker, return_seek=False):
         """Gets the genotypes for a given marker."""
         if self._mode != "r":
             raise UnsupportedOperation("not available in 'w' mode")
@@ -363,50 +383,21 @@ def get_geno_marker(self, marker):
         if marker not in set(self._bim.index):
             raise ValueError("{}: marker not in BIM".format(marker))
 
-        # Getting all the genotypes
-        i = self._bim.loc[marker, "i"]
-        geno = self._geno_values[self._bed[i]].flatten(order="C")
-
-        return geno[:self._nb_samples]
+        # Seeking to the correct position
+        seek_position = self._bim.loc[marker, "i"]
+        self.seek(seek_position)
 
-    def iter_acgt_geno_marker(self, markers):
-        """Iterates over genotypes for given markers (ACGT format)."""
-        if self._mode != "r":
-            raise UnsupportedOperation("not available in 'w' mode")
-
-        # If string, we change to list
-        if isinstance(markers, str):
-            markers = [markers]
-
-        # Checking the list of required markers
-        unknown_markers = set(markers) - set(self._bim.index)
-        if len(unknown_markers) > 0:
-            raise ValueError("{}: marker not in BIM".format(
-                sorted(unknown_markers)
-            ))
-
-        # Getting the required markers
-        required_markers = self._bim.loc[markers, :]
-
-        # Then, we iterate
-        for snp, i in required_markers.i.iteritems():
-            geno = self._geno_values[self._bed[i]].flatten(order="C")
-            yield snp, self._allele_encoding[i][geno[:self._nb_samples]]
+        if return_seek:
+            return self._read_current_marker(), seek_position
+        return self._read_current_marker()
 
     def get_acgt_geno_marker(self, marker):
         """Gets the genotypes for a given marker (ACGT format)."""
-        if self._mode != "r":
-            raise UnsupportedOperation("not available in 'w' mode")
-
-        # Check if the marker exists
-        if marker not in set(self._bim.index):
-            raise ValueError("{}: marker not in BIM".format(marker))
-
-        # Getting all the genotypes
-        i = self._bim.loc[marker, "i"]
-        geno = self._geno_values[self._bed[i]].flatten(order="C")
+        # Getting the marker's genotypes
+        geno, snp_position = self.get_geno_marker(marker, return_seek=True)
 
-        return self._allele_encoding[i][geno[:self._nb_samples]]
+        # Returning the ACGT's format of the genotypes
+        return self._allele_encoding[snp_position][geno]
 
     def write_marker(self, genotypes):
         """Deprecated function."""
@@ -434,7 +425,7 @@ def write_genotypes(self, genotypes):
             g[0] | (g[1] << 2) | (g[2] << 4) | (g[3] << 6) for g in
             self._grouper((_byte_recode[geno] for geno in genotypes), 4)
         ]
-        self._bed_file.write(bytearray(byte_array))
+        self._bed.write(bytearray(byte_array))
 
     @staticmethod
     def _grouper(iterable, n, fillvalue=0):