From ab21492a1ba92fdda7417c440661d96e1b145ed8 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 20 Jan 2017 09:01:47 -0500 Subject: [PATCH 01/15] Refactor so that PyPlink doesn't load all into memory (for large datasets) --- pyplink/pyplink.py | 119 +++++++++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 53 deletions(-) diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index ba52dc9..7a37b5f 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -125,7 +125,7 @@ def __init__(self, i_prefix, mode="r", bed_format="SNP-major"): self._nb_values = None # Opening the output BED file - self._bed_file = open(self.bed_filename, "wb") + self._bed = open(self.bed_filename, "wb") self._write_bed_header() else: @@ -162,36 +162,43 @@ def __exit__(self, *args): def close(self): """Closes the BED file if required.""" - if self._mode == "w": - # Closing the BED file - self._bed_file.close() + # Closing the BED file + self._bed.close() def next(self): """The next function.""" if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") - if self._n < self._nb_markers: - self._n += 1 - - # We want to return information about the marker and the genotypes - geno = self._geno_values[self._bed[self._n - 1]].flatten(order="C") - return self._markers[self._n - 1], geno[:self._nb_samples] - else: + self._n += 1 + if self._n > self._nb_markers: raise StopIteration() + return self._markers[self._n - 1], self._read_current_marker() + + def _read_current_marker(self): + """Reads the current marker and returns its genotypes.""" + return self._geno_values[ + np.fromstring(self._bed.read(self._nb_bytes), dtype=np.uint8) + ].flatten(order="C")[:self._nb_samples] + def seek(self, n): """Gets to a certain position in the BED file when iterating.""" if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") - if 0 <= n < len(self._bed): + if 0 <= n < self._nb_markers: self._n = n + self._bed.seek(self._get_seek_position(n)) else: # Invalid seek value raise ValueError("invalid position in BED: {}".format(n)) + def _get_seek_position(self, n): + """Gets the seek position in the file (including special bytes).""" + return 3 + self._nb_bytes * n + def _read_bim(self): """Reads the BIM file.""" # Reading the BIM file and setting the values @@ -204,7 +211,7 @@ def _read_bim(self): bim["a2"] = bim["a2"].astype(str) bim = bim.set_index("snp") - bim["i"] = range(len(bim)) + bim["i"] = range(bim.shape[0]) bim[2] = bim.a1 * 2 # Original '0' bim[1] = bim.a1 + bim.a2 # Original '2' bim[0] = bim.a2 * 2 # Original '3' @@ -219,7 +226,7 @@ def _read_bim(self): # Saving the data in the object self._bim = bim[["chrom", "pos", "cm", "a1", "a2", "i"]] - self._nb_markers = len(self._bim) + self._nb_markers = self._bim.shape[0] self._markers = np.array(list(self._bim.index)) def get_bim(self): @@ -256,7 +263,7 @@ def _read_fam(self): # Saving the data in the object self._fam = fam - self._nb_samples = len(self._fam) + self._nb_samples = self._fam.shape[0] def get_fam(self): """Returns the FAM file.""" @@ -278,7 +285,7 @@ def _read_bed(self): if (self._bim is None) or (self._fam is None): raise RuntimeError("no BIM or FAM file were read") - data = None + # Checking the file is valid by looking at the first 3 bytes with open(self.bed_filename, "rb") as bed_file: # Checking that the first two bytes are OK if (ord(bed_file.read(1)) != 108) or (ord(bed_file.read(1)) != 27): @@ -290,45 +297,41 @@ def _read_bed(self): raise ValueError("not in SNP-major format (please recode): " "{}".format(self.bed_filename)) - # The number of bytes per marker - nb_bytes = int(np.ceil(self._nb_samples / 4.0)) - - # Reading the data - data = np.fromfile(bed_file, dtype=np.uint8) + # The number of bytes per marker + self._nb_bytes = int(np.ceil(self._nb_samples / 4.0)) - # Checking the data - if data.shape[0] != (self._nb_markers * nb_bytes): - raise ValueError("invalid number of entries: " - "{}".format(self.bed_filename)) - data.shape = (self._nb_markers, nb_bytes) - - # Saving the data in the object - self._bed = data + # Opening the file for the rest of the operations (reading 3 bytes) + self._bed = open(self.bed_filename, "rb") + self._bed.read(3) def _write_bed_header(self): """Writes the BED first 3 bytes.""" # Writing the first three bytes final_byte = 1 if self._bed_format == "SNP-major" else 0 - self._bed_file.write(bytearray((108, 27, final_byte))) + self._bed.write(bytearray((108, 27, final_byte))) def iter_geno(self): - """Iterates over genotypes.""" + """Iterates over genotypes from the beginning.""" if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") - for i in range(len(self._bed)): - geno = self._geno_values[self._bed[i]].flatten(order="C") - yield self._markers[i], geno[:self._nb_samples] + # Seeking back at the beginning of the file + self.seek(0) + + # Return itself (the generator) + return self def iter_acgt_geno(self): """Iterates over genotypes (ACGT format).""" if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") - for i in range(len(self._bed)): - geno = self._geno_values[self._bed[i]].flatten(order="C") - yield (self._markers[i], - self._allele_encoding[i][geno[:self._nb_samples]]) + # Seeking back at the beginning of the file + self.seek(0) + + # Need to iterate over itself, and modify the actual genotypes + for i, (marker, geno) in enumerate(self): + yield marker, self._allele_encoding[i][geno] def iter_geno_marker(self, markers): """Iterates over genotypes for given markers.""" @@ -350,9 +353,12 @@ def iter_geno_marker(self, markers): required_markers = self._bim.loc[markers, :] # Then, we iterate - for snp, i in required_markers.i.iteritems(): - geno = self._geno_values[self._bed[i]].flatten(order="C") - yield snp, geno[:self._nb_samples] + for snp, seek_position in required_markers.i.iteritems(): + # Seeking to the correct position + self.seek(seek_position) + + # Getting the value + yield snp, self._read_current_marker() def get_geno_marker(self, marker): """Gets the genotypes for a given marker.""" @@ -363,11 +369,11 @@ def get_geno_marker(self, marker): if marker not in set(self._bim.index): raise ValueError("{}: marker not in BIM".format(marker)) - # Getting all the genotypes - i = self._bim.loc[marker, "i"] - geno = self._geno_values[self._bed[i]].flatten(order="C") + # Seeking to the correct position + seek_position = self._bim.loc[marker, "i"] + self.seek(seek_position) - return geno[:self._nb_samples] + return self._read_current_marker() def iter_acgt_geno_marker(self, markers): """Iterates over genotypes for given markers (ACGT format).""" @@ -389,9 +395,13 @@ def iter_acgt_geno_marker(self, markers): required_markers = self._bim.loc[markers, :] # Then, we iterate - for snp, i in required_markers.i.iteritems(): - geno = self._geno_values[self._bed[i]].flatten(order="C") - yield snp, self._allele_encoding[i][geno[:self._nb_samples]] + for snp, seek_position in required_markers.i.iteritems(): + # Seeking to the correct position + self.seek(seek_position) + + # Getting the genotype and converting to ACGT + geno = self._read_current_marker() + yield snp, self._allele_encoding[seek_position][geno] def get_acgt_geno_marker(self, marker): """Gets the genotypes for a given marker (ACGT format).""" @@ -402,11 +412,14 @@ def get_acgt_geno_marker(self, marker): if marker not in set(self._bim.index): raise ValueError("{}: marker not in BIM".format(marker)) - # Getting all the genotypes - i = self._bim.loc[marker, "i"] - geno = self._geno_values[self._bed[i]].flatten(order="C") + # Seeking to the correct position + seek_position = self._bim.loc[marker, "i"] + self.seek(seek_position) + + # Getting the genotypes and converting them to ACGT + geno = self._read_current_marker() - return self._allele_encoding[i][geno[:self._nb_samples]] + return self._allele_encoding[seek_position][geno] def write_marker(self, genotypes): """Deprecated function.""" @@ -434,7 +447,7 @@ def write_genotypes(self, genotypes): g[0] | (g[1] << 2) | (g[2] << 4) | (g[3] << 6) for g in self._grouper((_byte_recode[geno] for geno in genotypes), 4) ] - self._bed_file.write(bytearray(byte_array)) + self._bed.write(bytearray(byte_array)) @staticmethod def _grouper(iterable, n, fillvalue=0): From ea4382e5d86e6316c500bd0ce51d3dc3eae74496 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 20 Jan 2017 09:02:19 -0500 Subject: [PATCH 02/15] Updated tests to work with current changes --- pyplink/tests/test_pyplink.py | 119 ++++++++++++++++++---------------- 1 file changed, 64 insertions(+), 55 deletions(-) diff --git a/pyplink/tests/test_pyplink.py b/pyplink/tests/test_pyplink.py index c07dc06..2393890 100644 --- a/pyplink/tests/test_pyplink.py +++ b/pyplink/tests/test_pyplink.py @@ -98,7 +98,6 @@ def get_plink(tmp_dir): return None, "Plink's URL is not available" # Unzipping Plink - plink_dir = os.path.join(tmp_dir, ) with zipfile.ZipFile(zip_path, "r") as z: z.extractall(tmp_dir) plink_path = os.path.join(tmp_dir, os.path.splitext(filename)[0], @@ -177,16 +176,16 @@ def tearDownClass(cls): # Cleaning the temporary directory shutil.rmtree(cls.tmp_dir) + def tearDown(self): + # Closing the PyPlink object + self.pedfile.close() + def test_pyplink_object_integrity(self): """Checks the integrity of the PyPlink object.""" # Checking the name of the BED file self.assertTrue(hasattr(self.pedfile, "bed_filename")) self.assertEqual(self.bed, self.pedfile.bed_filename) - # Checking the BED object - self.assertTrue(hasattr(self.pedfile, "_bed")) - self.assertTrue(isinstance(self.pedfile._bed, np.ndarray)) - # Checking the name of the BIM file self.assertTrue(hasattr(self.pedfile, "bim_filename")) self.assertEqual(self.bim, self.pedfile.bim_filename) @@ -205,24 +204,21 @@ def test_pyplink_object_integrity(self): def test_pyplink_object_error(self): """Checks what happens when we play with the PyPlink object.""" - # Creating a new object - data = PyPlink(self.prefix) - # Changing the BIM to None - ori = data._bim - data._bim = None + ori = self.pedfile._bim + self.pedfile._bim = None with self.assertRaises(RuntimeError) as cm: - data._read_bed() + self.pedfile._read_bed() self.assertEqual("no BIM or FAM file were read", str(cm.exception)) - data._bim = ori + self.pedfile._bim = ori # Changing the FAM to None - ori = data._fam - data._fam = None + ori = self.pedfile._fam + self.pedfile._fam = None with self.assertRaises(RuntimeError) as cm: - data._read_bed() + self.pedfile._read_bed() self.assertEqual("no BIM or FAM file were read", str(cm.exception)) - data._fam = ori + self.pedfile._fam = ori def test_pyplink_bad_bed(self): """Checks what happens when we read a bad BED file.""" @@ -239,16 +235,18 @@ def test_pyplink_bad_bed(self): with open(new_fam, "w") as o_file, open(self.fam, "r") as i_file: o_file.write(i_file.read()) - # Creating a new BED file with invalid number of bytes - new_bed = new_prefix + ".bed" - with open(new_bed, "wb") as o_file: - o_file.write(bytearray([108, 27, 1, 1, 2, 3, 4])) + # TODO: Change to parse the actual file completely (since we no longer + # load to memory +# # Creating a new BED file with invalid number of bytes +# new_bed = new_prefix + ".bed" +# with open(new_bed, "wb") as o_file: +# o_file.write(bytearray([108, 27, 1, 1, 2, 3, 4])) - # This should raise an exception - with self.assertRaises(ValueError) as cm: - PyPlink(new_prefix) - self.assertEqual("invalid number of entries: {}".format(new_bed), - str(cm.exception)) +# # This should raise an exception +# with self.assertRaises(ValueError) as cm: +# PyPlink(new_prefix) +# self.assertEqual("invalid number of entries: {}".format(new_bed), +# str(cm.exception)) # Creating a new BED file with invalid first byte new_bed = new_prefix + ".bed" @@ -290,7 +288,7 @@ def test_missing_files(self): # Creating dummy BED/BIM/FAM files prefix = os.path.join(self.tmp_dir, "test_missing") for extension in (".bed", ".bim", ".fam"): - with open(prefix + extension, "w") as o_file: + with open(prefix + extension, "w"): pass # Removing the files (one by one) and checking the exception is raised @@ -300,7 +298,7 @@ def test_missing_files(self): PyPlink(prefix) self.assertEqual("No such file: '{}'".format(prefix + extension), str(cm.exception)) - with open(prefix + extension, "w") as o_file: + with open(prefix + extension, "w"): pass def test_get_nb_markers(self): @@ -447,7 +445,7 @@ def test_generator(self): ) for (e_marker, e_geno), (marker, geno) in zipped: self.assertEqual(e_marker, marker) - self.assertTrue((e_geno == geno).all()) + np.testing.assert_array_equal(e_geno, geno) # The generator should be empty remaining = [(marker, geno) for marker, geno in self.pedfile] @@ -467,7 +465,7 @@ def test_next(self): # Comparing self.assertEqual(self.markers[0], marker) - self.assertTrue((self.genotypes[0] == genotypes).all()) + np.testing.assert_array_equal(self.genotypes[0], genotypes) def test_next_w_mode(self): """Tests that an exception is raised when calling next in w mode.""" @@ -494,7 +492,7 @@ def test_seek(self): self.pedfile.seek(1) for (e_marker, e_geno), (marker, geno) in zipped: self.assertEqual(e_marker, marker) - self.assertTrue((e_geno == geno).all()) + np.testing.assert_array_equal(e_geno, geno) # Seeking at the fourth position zipped = zip( @@ -504,7 +502,17 @@ def test_seek(self): self.pedfile.seek(3) for (e_marker, e_geno), (marker, geno) in zipped: self.assertEqual(e_marker, marker) - self.assertTrue((e_geno == geno).all()) + np.testing.assert_array_equal(e_geno, geno) + + # Seeking at the tenth position + zipped = zip( + [i for i in zip(self.markers[9:], self.genotypes[9:])], + self.pedfile, + ) + self.pedfile.seek(9) + for (e_marker, e_geno), (marker, geno) in zipped: + self.assertEqual(e_marker, marker) + np.testing.assert_array_equal(e_geno, geno) # Seeking at an invalid position with self.assertRaises(ValueError) as cm: @@ -516,6 +524,11 @@ def test_seek(self): self.pedfile.seek(100) self.assertEqual("invalid position in BED: 100", str(cm.exception)) + # Seeking at an invalid position + with self.assertRaises(ValueError) as cm: + self.pedfile.seek(10) + self.assertEqual("invalid position in BED: 10", str(cm.exception)) + def test_seek_w_mode(self): """Tests that an exception is raised if in write mode.""" with self.assertRaises(UnsupportedOperation) as cm: @@ -532,7 +545,7 @@ def test_iter_geno(self): ) for (e_marker, e_geno), (marker, geno) in zipped: self.assertEqual(e_marker, marker) - self.assertTrue((e_geno == geno).all()) + np.testing.assert_array_equal(e_geno, geno) def test_iter_geno_w_mode(self): """Tests that an exception is raised if in write mode.""" @@ -550,7 +563,7 @@ def test_iter_acgt_geno(self): ) for (e_marker, e_geno), (marker, geno) in zipped: self.assertEqual(e_marker, marker) - self.assertTrue((e_geno == geno).all()) + np.testing.assert_array_equal(e_geno, geno) def test_iter_acgt_geno_w_mode(self): """Tests that an exception is raised if in write mode.""" @@ -576,7 +589,7 @@ def test_iter_geno_marker(self): ) for (e_marker, e_geno), (marker, geno) in zipped: self.assertEqual(e_marker, marker) - self.assertTrue((e_geno == geno).all()) + np.testing.assert_array_equal(e_geno, geno) # Testing a single marker index = random.randint(0, len(self.markers) - 1) @@ -584,7 +597,7 @@ def test_iter_geno_marker(self): e_geno = self.genotypes[index] for marker, geno in self.pedfile.iter_geno_marker(e_marker): self.assertEqual(e_marker, marker) - self.assertTrue((e_geno == geno).all()) + np.testing.assert_array_equal(e_geno, geno) # Adding a marker that doesn't exist markers.extend(["unknown_1", "unknown_2"]) @@ -617,7 +630,7 @@ def test_iter_acgt_geno_marker(self): ) for (e_marker, e_geno), (marker, geno) in zipped: self.assertEqual(e_marker, marker) - self.assertTrue((e_geno == geno).all()) + np.testing.assert_array_equal(e_geno, geno) # Testing a single marker index = random.randint(0, len(self.markers) - 1) @@ -625,7 +638,7 @@ def test_iter_acgt_geno_marker(self): e_geno = self.acgt_genotypes[index] for marker, geno in self.pedfile.iter_acgt_geno_marker(e_marker): self.assertEqual(e_marker, marker) - self.assertTrue((e_geno == geno).all()) + np.testing.assert_array_equal(e_geno, geno) # Adding a marker that doesn't exist markers.extend(["unknown_3", "unknown_4"]) @@ -670,14 +683,10 @@ def test_repr_w_mode(self): e_repr = 'PyPlink(mode="w")' # Creating the dummy PyPlink object - pedfile = PyPlink(os.path.join(self.tmp_dir, "test_repr"), "w") - - # Comparing the expected with the observed representation - o_repr = str(pedfile) - self.assertEqual(e_repr, o_repr) - - # Closing the file - pedfile.close() + with PyPlink(os.path.join(self.tmp_dir, "test_repr"), "w") as pedfile: + # Comparing the expected with the observed representation + o_repr = str(pedfile) + self.assertEqual(e_repr, o_repr) def test_get_geno_marker(self): """Tests the 'get_geno_marker' function.""" @@ -688,7 +697,7 @@ def test_get_geno_marker(self): # Getting the genotype o_geno = self.pedfile.get_geno_marker(marker) - self.assertTrue((o_geno == e_geno).all()) + np.testing.assert_array_equal(o_geno, e_geno) # Asking for an unknown marker should raise an ValueError with self.assertRaises(ValueError) as cm: @@ -703,7 +712,7 @@ def test_get_geno_marker_w_mode(self): with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: - genotypes = p.get_geno_marker("M1") + p.get_geno_marker("M1") self.assertEqual("not available in 'w' mode", str(cm.exception)) def test_get_iter_w_mode(self): @@ -711,7 +720,7 @@ def test_get_iter_w_mode(self): with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: - genotypes = iter(p) + iter(p) self.assertEqual("not available in 'w' mode", str(cm.exception)) def test_get_acgt_geno_marker(self): @@ -723,7 +732,7 @@ def test_get_acgt_geno_marker(self): # Getting the genotype o_geno = self.pedfile.get_acgt_geno_marker(marker) - self.assertTrue((o_geno == e_geno).all()) + np.testing.assert_array_equal(o_geno, e_geno) # Asking for an unknown marker should raise an ValueError with self.assertRaises(ValueError) as cm: @@ -735,7 +744,7 @@ def test_get_acgt_geno_marker_w_mode(self): with self.assertRaises(UnsupportedOperation) as cm: # Creating the dummy PyPlink object with PyPlink(os.path.join(self.tmp_dir, "test_error"), "w") as p: - genotypes = p.get_acgt_geno_marker("M1") + p.get_acgt_geno_marker("M1") self.assertEqual("not available in 'w' mode", str(cm.exception)) def test_get_context_read_mode(self): @@ -746,7 +755,7 @@ def test_get_context_read_mode(self): def test_invalid_mode(self): """Tests invalid mode when PyPlink as context manager.""" with self.assertRaises(ValueError) as cm: - genotypes = PyPlink(self.prefix, "u") + PyPlink(self.prefix, "u") self.assertEqual("invalid mode: 'u'", str(cm.exception)) def test_write_binary(self): @@ -782,10 +791,10 @@ def test_write_binary(self): file=o_file) # Reading the written binary file - pedfile = PyPlink(test_prefix) - for i, (marker, genotypes) in enumerate(pedfile): - self.assertEqual("m{}".format(i+1), marker) - self.assertTrue((expected_genotypes[i] == genotypes).all()) + with PyPlink(test_prefix) as pedfile: + for i, (marker, genotypes) in enumerate(pedfile): + self.assertEqual("m{}".format(i+1), marker) + np.testing.assert_array_equal(expected_genotypes[i], genotypes) def test_write_binary_error(self): """Tests writing a binary file, with different number of sample.""" From 2fcdda2429e5aa33e1f838911ddda49cfef0d29a Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 20 Jan 2017 09:06:53 -0500 Subject: [PATCH 03/15] Added python 3.6 to testing, and pip install requirements to be faster for some python version --- .travis.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.travis.yml b/.travis.yml index 07b72bd..33cf18c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,11 @@ python: - "3.3" - "3.4" - "3.5" + - "3.6" install: + - pip install six + - pip install numpy + - pip install pandas - pip install coveralls script: - coverage run setup.py test From 711807ed45b0900e0723cc36e0d4ccff6a0424d3 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 20 Jan 2017 09:15:29 -0500 Subject: [PATCH 04/15] Added 3.6 to classifiers --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index eed0f10..10f5ac7 100644 --- a/setup.py +++ b/setup.py @@ -65,9 +65,11 @@ def setup_package(): "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", "License :: OSI Approved :: MIT License", "Topic :: Scientific/Engineering :: Bio-Informatics"], - keywords="bioinformatics format plink binary", + keywords="bioinformatics format Plink binary", ) return From 732e1fc86383d16d385645235932c64fa0f2b5dd Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 20 Jan 2017 09:22:06 -0500 Subject: [PATCH 05/15] Ran the demo, just to be safe --- demo/PyPlink Demo.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/demo/PyPlink Demo.ipynb b/demo/PyPlink Demo.ipynb index 8e2f0bb..9eaf1bb 100644 --- a/demo/PyPlink Demo.ipynb +++ b/demo/PyPlink Demo.ipynb @@ -1146,7 +1146,7 @@ "\n", "Skipping web check... [ --noweb ] \n", "Writing this text to log file [ plink.log ]\n", - "Analysis started: Wed Nov 4 14:50:58 2015\n", + "Analysis started: Fri Jan 20 09:20:55 2017\n", "\n", "Options in effect:\n", "\t--noweb\n", @@ -1171,7 +1171,7 @@ "10 founders and 0 non-founders found\n", "Writing allele frequencies (founders-only) to [ plink.frq ] \n", "\n", - "Analysis finished: Wed Nov 4 14:50:58 2015\n", + "Analysis finished: Fri Jan 20 09:20:55 2017\n", "\n" ] } @@ -1287,7 +1287,7 @@ "\n", "Skipping web check... [ --noweb ] \n", "Writing this text to log file [ plink_2.log ]\n", - "Analysis started: Wed Nov 4 14:50:58 2015\n", + "Analysis started: Fri Jan 20 09:21:10 2017\n", "\n", "Options in effect:\n", "\t--noweb\n", @@ -1314,7 +1314,7 @@ "10 founders and 0 non-founders found\n", "Writing allele frequencies (founders-only) to [ plink_2.frq ] \n", "\n", - "Analysis finished: Wed Nov 4 14:50:58 2015\n", + "Analysis finished: Fri Jan 20 09:21:10 2017\n", "\n" ] } @@ -1370,7 +1370,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.0" + "version": "3.5.2" } }, "nbformat": 4, From 2f2e9a019f00aee4037f01387b9551c05af69524 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 20 Jan 2017 09:45:21 -0500 Subject: [PATCH 06/15] Now check if BED might be corrupted by looking last entry --- pyplink/pyplink.py | 16 +++++++++++++--- pyplink/tests/test_pyplink.py | 26 ++++++++++++-------------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index 7a37b5f..40ae267 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -285,7 +285,11 @@ def _read_bed(self): if (self._bim is None) or (self._fam is None): raise RuntimeError("no BIM or FAM file were read") - # Checking the file is valid by looking at the first 3 bytes + # The number of bytes per marker + self._nb_bytes = int(np.ceil(self._nb_samples / 4.0)) + + # Checking the file is valid by looking at the first 3 bytes and the + # last entry (correct size) with open(self.bed_filename, "rb") as bed_file: # Checking that the first two bytes are OK if (ord(bed_file.read(1)) != 108) or (ord(bed_file.read(1)) != 27): @@ -297,8 +301,14 @@ def _read_bed(self): raise ValueError("not in SNP-major format (please recode): " "{}".format(self.bed_filename)) - # The number of bytes per marker - self._nb_bytes = int(np.ceil(self._nb_samples / 4.0)) + # Checking the last entry + seek_position = self._get_seek_position(self._bim.iloc[-1, :].i) + bed_file.seek(seek_position) + geno = self._geno_values[ + np.fromstring(bed_file.read(self._nb_bytes), dtype=np.uint8) + ].flatten(order="C")[:self._nb_samples] + if geno.shape[0] != self._nb_samples: + raise ValueError("invalid number of entries: corrupted BED?") # Opening the file for the rest of the operations (reading 3 bytes) self._bed = open(self.bed_filename, "rb") diff --git a/pyplink/tests/test_pyplink.py b/pyplink/tests/test_pyplink.py index 2393890..4ee0ea6 100644 --- a/pyplink/tests/test_pyplink.py +++ b/pyplink/tests/test_pyplink.py @@ -231,22 +231,20 @@ def test_pyplink_bad_bed(self): o_file.write(i_file.read()) # Copying the BIM file - new_fam = new_prefix + ".bim" - with open(new_fam, "w") as o_file, open(self.fam, "r") as i_file: + new_bim = new_prefix + ".bim" + with open(new_bim, "w") as o_file, open(self.bim, "r") as i_file: o_file.write(i_file.read()) - # TODO: Change to parse the actual file completely (since we no longer - # load to memory -# # Creating a new BED file with invalid number of bytes -# new_bed = new_prefix + ".bed" -# with open(new_bed, "wb") as o_file: -# o_file.write(bytearray([108, 27, 1, 1, 2, 3, 4])) - -# # This should raise an exception -# with self.assertRaises(ValueError) as cm: -# PyPlink(new_prefix) -# self.assertEqual("invalid number of entries: {}".format(new_bed), -# str(cm.exception)) + # Creating a new BED file with invalid number of bytes + new_bed = new_prefix + ".bed" + with open(new_bed, "wb") as o_file: + o_file.write(bytearray([108, 27, 1, 1, 2, 3, 4])) + + # This should raise an exception + with self.assertRaises(ValueError) as cm: + PyPlink(new_prefix) + self.assertEqual("invalid number of entries: corrupted BED?", + str(cm.exception)) # Creating a new BED file with invalid first byte new_bed = new_prefix + ".bed" From 3938b8218137524036382f2bbf8f97dcec19ec9f Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 20 Jan 2017 09:54:08 -0500 Subject: [PATCH 07/15] Simplified the 'iter_acgt_geno' function so that it uses 'iter_geno' directly. --- pyplink/pyplink.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index 40ae267..ad6c1ea 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -333,14 +333,8 @@ def iter_geno(self): def iter_acgt_geno(self): """Iterates over genotypes (ACGT format).""" - if self._mode != "r": - raise UnsupportedOperation("not available in 'w' mode") - - # Seeking back at the beginning of the file - self.seek(0) - # Need to iterate over itself, and modify the actual genotypes - for i, (marker, geno) in enumerate(self): + for i, (marker, geno) in enumerate(self.iter_geno()): yield marker, self._allele_encoding[i][geno] def iter_geno_marker(self, markers): From 2d003fc5c8e843baf0b2e219f0d77da322c3c75c Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 20 Jan 2017 09:54:35 -0500 Subject: [PATCH 08/15] Reordered functions --- pyplink/pyplink.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index ad6c1ea..5b0fcff 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -364,21 +364,6 @@ def iter_geno_marker(self, markers): # Getting the value yield snp, self._read_current_marker() - def get_geno_marker(self, marker): - """Gets the genotypes for a given marker.""" - if self._mode != "r": - raise UnsupportedOperation("not available in 'w' mode") - - # Check if the marker exists - if marker not in set(self._bim.index): - raise ValueError("{}: marker not in BIM".format(marker)) - - # Seeking to the correct position - seek_position = self._bim.loc[marker, "i"] - self.seek(seek_position) - - return self._read_current_marker() - def iter_acgt_geno_marker(self, markers): """Iterates over genotypes for given markers (ACGT format).""" if self._mode != "r": @@ -407,6 +392,21 @@ def iter_acgt_geno_marker(self, markers): geno = self._read_current_marker() yield snp, self._allele_encoding[seek_position][geno] + def get_geno_marker(self, marker): + """Gets the genotypes for a given marker.""" + if self._mode != "r": + raise UnsupportedOperation("not available in 'w' mode") + + # Check if the marker exists + if marker not in set(self._bim.index): + raise ValueError("{}: marker not in BIM".format(marker)) + + # Seeking to the correct position + seek_position = self._bim.loc[marker, "i"] + self.seek(seek_position) + + return self._read_current_marker() + def get_acgt_geno_marker(self, marker): """Gets the genotypes for a given marker (ACGT format).""" if self._mode != "r": From bf047ac966fcc15b595031b832eff9c0d8cdee18 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 20 Jan 2017 09:57:57 -0500 Subject: [PATCH 09/15] Simplified the 'iter_acgt_geno_marker' function so that it uses 'iter_geno_marker' directly. --- pyplink/pyplink.py | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index 5b0fcff..79c6c06 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -366,31 +366,11 @@ def iter_geno_marker(self, markers): def iter_acgt_geno_marker(self, markers): """Iterates over genotypes for given markers (ACGT format).""" - if self._mode != "r": - raise UnsupportedOperation("not available in 'w' mode") - - # If string, we change to list - if isinstance(markers, str): - markers = [markers] - - # Checking the list of required markers - unknown_markers = set(markers) - set(self._bim.index) - if len(unknown_markers) > 0: - raise ValueError("{}: marker not in BIM".format( - sorted(unknown_markers) - )) - - # Getting the required markers - required_markers = self._bim.loc[markers, :] - - # Then, we iterate - for snp, seek_position in required_markers.i.iteritems(): - # Seeking to the correct position - self.seek(seek_position) - - # Getting the genotype and converting to ACGT - geno = self._read_current_marker() - yield snp, self._allele_encoding[seek_position][geno] + # We iterate over the markers + for snp, geno in self.iter_geno_marker(markers): + # Getting the SNP position and converting to ACGT + snp_position = self._bim.loc[snp, "i"] + yield snp, self._allele_encoding[snp_position][geno] def get_geno_marker(self, marker): """Gets the genotypes for a given marker.""" From 42024fba7c64f8b571afb7d1f7b96087fbb5435d Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 20 Jan 2017 10:02:27 -0500 Subject: [PATCH 10/15] Simplified the 'get_acgt_geno_marker' function so that it uses 'get_geno_marker' directly. --- pyplink/pyplink.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index 79c6c06..3ca6fcc 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -389,21 +389,14 @@ def get_geno_marker(self, marker): def get_acgt_geno_marker(self, marker): """Gets the genotypes for a given marker (ACGT format).""" - if self._mode != "r": - raise UnsupportedOperation("not available in 'w' mode") - - # Check if the marker exists - if marker not in set(self._bim.index): - raise ValueError("{}: marker not in BIM".format(marker)) - - # Seeking to the correct position - seek_position = self._bim.loc[marker, "i"] - self.seek(seek_position) + # Getting the marker's genotypes + geno = self.get_geno_marker(marker) - # Getting the genotypes and converting them to ACGT - geno = self._read_current_marker() + # Getting the marker's position + snp_position = self._bim.loc[marker, "i"] - return self._allele_encoding[seek_position][geno] + # Returning the ACGT's format of the genotypes + return self._allele_encoding[snp_position][geno] def write_marker(self, genotypes): """Deprecated function.""" From 271a6a3f061e8c76a5ef40e337085f0390511e88 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 20 Jan 2017 10:05:53 -0500 Subject: [PATCH 11/15] Made it so that the we doesn't have to relook at the SNP position when converting to ACGT format --- pyplink/pyplink.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index 3ca6fcc..9b03211 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -361,6 +361,9 @@ def iter_geno_marker(self, markers): # Seeking to the correct position self.seek(seek_position) + # Setting the current SNP position + self._curr_snp_position = seek_position + # Getting the value yield snp, self._read_current_marker() @@ -369,8 +372,7 @@ def iter_acgt_geno_marker(self, markers): # We iterate over the markers for snp, geno in self.iter_geno_marker(markers): # Getting the SNP position and converting to ACGT - snp_position = self._bim.loc[snp, "i"] - yield snp, self._allele_encoding[snp_position][geno] + yield snp, self._allele_encoding[self._curr_snp_position][geno] def get_geno_marker(self, marker): """Gets the genotypes for a given marker.""" @@ -385,6 +387,9 @@ def get_geno_marker(self, marker): seek_position = self._bim.loc[marker, "i"] self.seek(seek_position) + # Setting the current SNP position + self._curr_snp_position = seek_position + return self._read_current_marker() def get_acgt_geno_marker(self, marker): @@ -392,11 +397,8 @@ def get_acgt_geno_marker(self, marker): # Getting the marker's genotypes geno = self.get_geno_marker(marker) - # Getting the marker's position - snp_position = self._bim.loc[marker, "i"] - # Returning the ACGT's format of the genotypes - return self._allele_encoding[snp_position][geno] + return self._allele_encoding[self._curr_snp_position][geno] def write_marker(self, genotypes): """Deprecated function.""" From 5556500f3a64d7d98baedced645eb0b7661ea169 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 20 Jan 2017 11:06:01 -0500 Subject: [PATCH 12/15] Added parameters to get seek (snp) position required by ACGT genotypes conversion. --- pyplink/pyplink.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index 9b03211..d5cefc0 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -337,7 +337,7 @@ def iter_acgt_geno(self): for i, (marker, geno) in enumerate(self.iter_geno()): yield marker, self._allele_encoding[i][geno] - def iter_geno_marker(self, markers): + def iter_geno_marker(self, markers, return_seek=False): """Iterates over genotypes for given markers.""" if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") @@ -361,20 +361,20 @@ def iter_geno_marker(self, markers): # Seeking to the correct position self.seek(seek_position) - # Setting the current SNP position - self._curr_snp_position = seek_position - # Getting the value - yield snp, self._read_current_marker() + if return_seek: + yield snp, self._read_current_marker(), seek_position + else: + yield snp, self._read_current_marker() def iter_acgt_geno_marker(self, markers): """Iterates over genotypes for given markers (ACGT format).""" # We iterate over the markers - for snp, geno in self.iter_geno_marker(markers): + for snp, geno, s in self.iter_geno_marker(markers, return_seek=True): # Getting the SNP position and converting to ACGT - yield snp, self._allele_encoding[self._curr_snp_position][geno] + yield snp, self._allele_encoding[s][geno] - def get_geno_marker(self, marker): + def get_geno_marker(self, marker, return_seek=False): """Gets the genotypes for a given marker.""" if self._mode != "r": raise UnsupportedOperation("not available in 'w' mode") @@ -387,18 +387,17 @@ def get_geno_marker(self, marker): seek_position = self._bim.loc[marker, "i"] self.seek(seek_position) - # Setting the current SNP position - self._curr_snp_position = seek_position - + if return_seek: + return self._read_current_marker(), seek_position return self._read_current_marker() def get_acgt_geno_marker(self, marker): """Gets the genotypes for a given marker (ACGT format).""" # Getting the marker's genotypes - geno = self.get_geno_marker(marker) + geno, snp_position = self.get_geno_marker(marker, return_seek=True) # Returning the ACGT's format of the genotypes - return self._allele_encoding[self._curr_snp_position][geno] + return self._allele_encoding[snp_position][geno] def write_marker(self, genotypes): """Deprecated function.""" From 1f541f3fc0f1dc89485eaf4020bb5b3b95740440 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 20 Jan 2017 11:14:28 -0500 Subject: [PATCH 13/15] Updated the README --- README.mkd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.mkd b/README.mkd index b23901e..d9d0e65 100644 --- a/README.mkd +++ b/README.mkd @@ -10,8 +10,8 @@ ## Dependencies -The tool requires a standard [Python](http://python.org/) installation (2.7 or -3.4) with the following modules: +The tool requires a standard [Python](http://python.org/) installation (2.7 and +3.3 or higher are supported) with the following modules: 1. [numpy](http://www.numpy.org/) version 1.8.2 or latest 2. [pandas](http://pandas.pydata.org/) version 0.14.1 or latest @@ -70,9 +70,9 @@ To test the module, just perform the following command: ```python >>> import pyplink >>> pyplink.test() -............................................ +............................................. ---------------------------------------------------------------------- -Ran 44 tests in 0.468s +Ran 45 tests in 0.480s OK ``` From 01acecc14da137c5b1aa416d98eb1418c8cb8189 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 20 Jan 2017 12:34:26 -0500 Subject: [PATCH 14/15] Updated setup file for next version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 10f5ac7..67c6c82 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ MAJOR = 1 -MINOR = 2 +MINOR = 3 MICRO = 0 VERSION = "{}.{}.{}".format(MAJOR, MINOR, MICRO) From 1837abea6873b6745bcd6b973cf3d1ede1b9e3d4 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Fri, 20 Jan 2017 12:34:40 -0500 Subject: [PATCH 15/15] Updated script for building conda package --- conda_build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda_build.sh b/conda_build.sh index a4033be..8f04ac9 100755 --- a/conda_build.sh +++ b/conda_build.sh @@ -8,8 +8,8 @@ pushd skeleton conda skeleton pypi pyplink # The different python versions and platforms -python_versions="2.7 3.3 3.4 3.5" -platforms="linux-32 linux-64 osx-64 win-32 win-64" +python_versions="2.7 3.3 3.4 3.5 3.6" +platforms="all" # Building for python_version in $python_versions