Skip to content

Commit

Permalink
Release 1.3.0
Browse files Browse the repository at this point in the history
  • Loading branch information
lemieuxl committed Jan 20, 2017
2 parents 2648dc3 + 1837abe commit b61ce30
Show file tree
Hide file tree
Showing 7 changed files with 152 additions and 148 deletions.
4 changes: 4 additions & 0 deletions .travis.yml
Expand Up @@ -4,7 +4,11 @@ python:
- "3.3"
- "3.4"
- "3.5"
- "3.6"
install:
- pip install six
- pip install numpy
- pip install pandas
- pip install coveralls
script:
- coverage run setup.py test
Expand Down
8 changes: 4 additions & 4 deletions README.mkd
Expand Up @@ -10,8 +10,8 @@

## Dependencies

The tool requires a standard [Python](http://python.org/) installation (2.7 or
3.4) with the following modules:
The tool requires a standard [Python](http://python.org/) installation (2.7 and
3.3 or higher are supported) with the following modules:

1. [numpy](http://www.numpy.org/) version 1.8.2 or latest
2. [pandas](http://pandas.pydata.org/) version 0.14.1 or latest
Expand Down Expand Up @@ -70,9 +70,9 @@ To test the module, just perform the following command:
```python
>>> import pyplink
>>> pyplink.test()
............................................
.............................................
----------------------------------------------------------------------
Ran 44 tests in 0.468s
Ran 45 tests in 0.480s

OK
```
Expand Down
4 changes: 2 additions & 2 deletions conda_build.sh
Expand Up @@ -8,8 +8,8 @@ pushd skeleton
conda skeleton pypi pyplink

# The different python versions and platforms
python_versions="2.7 3.3 3.4 3.5"
platforms="linux-32 linux-64 osx-64 win-32 win-64"
python_versions="2.7 3.3 3.4 3.5 3.6"
platforms="all"

# Building
for python_version in $python_versions
Expand Down
10 changes: 5 additions & 5 deletions demo/PyPlink Demo.ipynb
Expand Up @@ -1146,7 +1146,7 @@
"\n",
"Skipping web check... [ --noweb ] \n",
"Writing this text to log file [ plink.log ]\n",
"Analysis started: Wed Nov 4 14:50:58 2015\n",
"Analysis started: Fri Jan 20 09:20:55 2017\n",
"\n",
"Options in effect:\n",
"\t--noweb\n",
Expand All @@ -1171,7 +1171,7 @@
"10 founders and 0 non-founders found\n",
"Writing allele frequencies (founders-only) to [ plink.frq ] \n",
"\n",
"Analysis finished: Wed Nov 4 14:50:58 2015\n",
"Analysis finished: Fri Jan 20 09:20:55 2017\n",
"\n"
]
}
Expand Down Expand Up @@ -1287,7 +1287,7 @@
"\n",
"Skipping web check... [ --noweb ] \n",
"Writing this text to log file [ plink_2.log ]\n",
"Analysis started: Wed Nov 4 14:50:58 2015\n",
"Analysis started: Fri Jan 20 09:21:10 2017\n",
"\n",
"Options in effect:\n",
"\t--noweb\n",
Expand All @@ -1314,7 +1314,7 @@
"10 founders and 0 non-founders found\n",
"Writing allele frequencies (founders-only) to [ plink_2.frq ] \n",
"\n",
"Analysis finished: Wed Nov 4 14:50:58 2015\n",
"Analysis finished: Fri Jan 20 09:21:10 2017\n",
"\n"
]
}
Expand Down Expand Up @@ -1370,7 +1370,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.0"
"version": "3.5.2"
}
},
"nbformat": 4,
Expand Down
163 changes: 77 additions & 86 deletions pyplink/pyplink.py
Expand Up @@ -125,7 +125,7 @@ def __init__(self, i_prefix, mode="r", bed_format="SNP-major"):
self._nb_values = None

# Opening the output BED file
self._bed_file = open(self.bed_filename, "wb")
self._bed = open(self.bed_filename, "wb")
self._write_bed_header()

else:
Expand Down Expand Up @@ -162,36 +162,43 @@ def __exit__(self, *args):

def close(self):
"""Closes the BED file if required."""
if self._mode == "w":
# Closing the BED file
self._bed_file.close()
# Closing the BED file
self._bed.close()

def next(self):
"""The next function."""
if self._mode != "r":
raise UnsupportedOperation("not available in 'w' mode")

if self._n < self._nb_markers:
self._n += 1

# We want to return information about the marker and the genotypes
geno = self._geno_values[self._bed[self._n - 1]].flatten(order="C")
return self._markers[self._n - 1], geno[:self._nb_samples]
else:
self._n += 1
if self._n > self._nb_markers:
raise StopIteration()

return self._markers[self._n - 1], self._read_current_marker()

def _read_current_marker(self):
"""Reads the current marker and returns its genotypes."""
return self._geno_values[
np.fromstring(self._bed.read(self._nb_bytes), dtype=np.uint8)
].flatten(order="C")[:self._nb_samples]

def seek(self, n):
"""Gets to a certain position in the BED file when iterating."""
if self._mode != "r":
raise UnsupportedOperation("not available in 'w' mode")

if 0 <= n < len(self._bed):
if 0 <= n < self._nb_markers:
self._n = n
self._bed.seek(self._get_seek_position(n))

else:
# Invalid seek value
raise ValueError("invalid position in BED: {}".format(n))

def _get_seek_position(self, n):
"""Gets the seek position in the file (including special bytes)."""
return 3 + self._nb_bytes * n

def _read_bim(self):
"""Reads the BIM file."""
# Reading the BIM file and setting the values
Expand All @@ -204,7 +211,7 @@ def _read_bim(self):
bim["a2"] = bim["a2"].astype(str)

bim = bim.set_index("snp")
bim["i"] = range(len(bim))
bim["i"] = range(bim.shape[0])
bim[2] = bim.a1 * 2 # Original '0'
bim[1] = bim.a1 + bim.a2 # Original '2'
bim[0] = bim.a2 * 2 # Original '3'
Expand All @@ -219,7 +226,7 @@ def _read_bim(self):

# Saving the data in the object
self._bim = bim[["chrom", "pos", "cm", "a1", "a2", "i"]]
self._nb_markers = len(self._bim)
self._nb_markers = self._bim.shape[0]
self._markers = np.array(list(self._bim.index))

def get_bim(self):
Expand Down Expand Up @@ -256,7 +263,7 @@ def _read_fam(self):

# Saving the data in the object
self._fam = fam
self._nb_samples = len(self._fam)
self._nb_samples = self._fam.shape[0]

def get_fam(self):
"""Returns the FAM file."""
Expand All @@ -278,7 +285,11 @@ def _read_bed(self):
if (self._bim is None) or (self._fam is None):
raise RuntimeError("no BIM or FAM file were read")

data = None
# The number of bytes per marker
self._nb_bytes = int(np.ceil(self._nb_samples / 4.0))

# Checking the file is valid by looking at the first 3 bytes and the
# last entry (correct size)
with open(self.bed_filename, "rb") as bed_file:
# Checking that the first two bytes are OK
if (ord(bed_file.read(1)) != 108) or (ord(bed_file.read(1)) != 27):
Expand All @@ -290,47 +301,43 @@ def _read_bed(self):
raise ValueError("not in SNP-major format (please recode): "
"{}".format(self.bed_filename))

# The number of bytes per marker
nb_bytes = int(np.ceil(self._nb_samples / 4.0))
# Checking the last entry
seek_position = self._get_seek_position(self._bim.iloc[-1, :].i)
bed_file.seek(seek_position)
geno = self._geno_values[
np.fromstring(bed_file.read(self._nb_bytes), dtype=np.uint8)
].flatten(order="C")[:self._nb_samples]
if geno.shape[0] != self._nb_samples:
raise ValueError("invalid number of entries: corrupted BED?")

# Reading the data
data = np.fromfile(bed_file, dtype=np.uint8)

# Checking the data
if data.shape[0] != (self._nb_markers * nb_bytes):
raise ValueError("invalid number of entries: "
"{}".format(self.bed_filename))
data.shape = (self._nb_markers, nb_bytes)

# Saving the data in the object
self._bed = data
# Opening the file for the rest of the operations (reading 3 bytes)
self._bed = open(self.bed_filename, "rb")
self._bed.read(3)

def _write_bed_header(self):
"""Writes the BED first 3 bytes."""
# Writing the first three bytes
final_byte = 1 if self._bed_format == "SNP-major" else 0
self._bed_file.write(bytearray((108, 27, final_byte)))
self._bed.write(bytearray((108, 27, final_byte)))

def iter_geno(self):
"""Iterates over genotypes."""
"""Iterates over genotypes from the beginning."""
if self._mode != "r":
raise UnsupportedOperation("not available in 'w' mode")

for i in range(len(self._bed)):
geno = self._geno_values[self._bed[i]].flatten(order="C")
yield self._markers[i], geno[:self._nb_samples]
# Seeking back at the beginning of the file
self.seek(0)

# Return itself (the generator)
return self

def iter_acgt_geno(self):
"""Iterates over genotypes (ACGT format)."""
if self._mode != "r":
raise UnsupportedOperation("not available in 'w' mode")

for i in range(len(self._bed)):
geno = self._geno_values[self._bed[i]].flatten(order="C")
yield (self._markers[i],
self._allele_encoding[i][geno[:self._nb_samples]])
# Need to iterate over itself, and modify the actual genotypes
for i, (marker, geno) in enumerate(self.iter_geno()):
yield marker, self._allele_encoding[i][geno]

def iter_geno_marker(self, markers):
def iter_geno_marker(self, markers, return_seek=False):
"""Iterates over genotypes for given markers."""
if self._mode != "r":
raise UnsupportedOperation("not available in 'w' mode")
Expand All @@ -350,11 +357,24 @@ def iter_geno_marker(self, markers):
required_markers = self._bim.loc[markers, :]

# Then, we iterate
for snp, i in required_markers.i.iteritems():
geno = self._geno_values[self._bed[i]].flatten(order="C")
yield snp, geno[:self._nb_samples]
for snp, seek_position in required_markers.i.iteritems():
# Seeking to the correct position
self.seek(seek_position)

# Getting the value
if return_seek:
yield snp, self._read_current_marker(), seek_position
else:
yield snp, self._read_current_marker()

def iter_acgt_geno_marker(self, markers):
"""Iterates over genotypes for given markers (ACGT format)."""
# We iterate over the markers
for snp, geno, s in self.iter_geno_marker(markers, return_seek=True):
# Getting the SNP position and converting to ACGT
yield snp, self._allele_encoding[s][geno]

def get_geno_marker(self, marker):
def get_geno_marker(self, marker, return_seek=False):
"""Gets the genotypes for a given marker."""
if self._mode != "r":
raise UnsupportedOperation("not available in 'w' mode")
Expand All @@ -363,50 +383,21 @@ def get_geno_marker(self, marker):
if marker not in set(self._bim.index):
raise ValueError("{}: marker not in BIM".format(marker))

# Getting all the genotypes
i = self._bim.loc[marker, "i"]
geno = self._geno_values[self._bed[i]].flatten(order="C")

return geno[:self._nb_samples]
# Seeking to the correct position
seek_position = self._bim.loc[marker, "i"]
self.seek(seek_position)

def iter_acgt_geno_marker(self, markers):
"""Iterates over genotypes for given markers (ACGT format)."""
if self._mode != "r":
raise UnsupportedOperation("not available in 'w' mode")

# If string, we change to list
if isinstance(markers, str):
markers = [markers]

# Checking the list of required markers
unknown_markers = set(markers) - set(self._bim.index)
if len(unknown_markers) > 0:
raise ValueError("{}: marker not in BIM".format(
sorted(unknown_markers)
))

# Getting the required markers
required_markers = self._bim.loc[markers, :]

# Then, we iterate
for snp, i in required_markers.i.iteritems():
geno = self._geno_values[self._bed[i]].flatten(order="C")
yield snp, self._allele_encoding[i][geno[:self._nb_samples]]
if return_seek:
return self._read_current_marker(), seek_position
return self._read_current_marker()

def get_acgt_geno_marker(self, marker):
"""Gets the genotypes for a given marker (ACGT format)."""
if self._mode != "r":
raise UnsupportedOperation("not available in 'w' mode")

# Check if the marker exists
if marker not in set(self._bim.index):
raise ValueError("{}: marker not in BIM".format(marker))

# Getting all the genotypes
i = self._bim.loc[marker, "i"]
geno = self._geno_values[self._bed[i]].flatten(order="C")
# Getting the marker's genotypes
geno, snp_position = self.get_geno_marker(marker, return_seek=True)

return self._allele_encoding[i][geno[:self._nb_samples]]
# Returning the ACGT's format of the genotypes
return self._allele_encoding[snp_position][geno]

def write_marker(self, genotypes):
"""Deprecated function."""
Expand Down Expand Up @@ -434,7 +425,7 @@ def write_genotypes(self, genotypes):
g[0] | (g[1] << 2) | (g[2] << 4) | (g[3] << 6) for g in
self._grouper((_byte_recode[geno] for geno in genotypes), 4)
]
self._bed_file.write(bytearray(byte_array))
self._bed.write(bytearray(byte_array))

@staticmethod
def _grouper(iterable, n, fillvalue=0):
Expand Down

0 comments on commit b61ce30

Please sign in to comment.