From 45731d66211ca2b05ee58c10bb3b34aa9731671f Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Tue, 15 Sep 2015 15:00:11 -0400
Subject: [PATCH 01/13] Added a script to build for conda (and all relevant
 python2 and 3 versions)

---
 conda_build.sh | 34 ++++++++++++++++++++++++++++++++++
 setup.py       |  4 +---
 2 files changed, 35 insertions(+), 3 deletions(-)
 create mode 100755 conda_build.sh

diff --git a/conda_build.sh b/conda_build.sh
new file mode 100755
index 0000000..bf5d9c3
--- /dev/null
+++ b/conda_build.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+
+# Creating a directory for the skeleton
+mkdir -p skeleton
+pushd skeleton
+
+# Creating the skeleton
+conda skeleton pypi pyplink
+
+# The different python versions and platforms
+python_versions="2.7 3.3 3.4 3.5"
+platforms="linux-32 linux-64 osx-64 win-32 win-64"
+
+# Building
+for python_version in $python_versions
+do
+    # Building
+    conda build --python $python_version pyplink &> log.txt
+    filename=$(egrep "^# [$] binstar upload \S+$" log.txt | cut -d " " -f 5)
+
+    # Converting
+    for platform in $platforms
+    do
+        conda convert -p $platform $filename -o ../conda_dist
+    done
+done
+
+popd
+rm -rf skeleton
+
+# Indexing
+pushd conda_dist
+conda index *
+popd
diff --git a/setup.py b/setup.py
index 4455fd5..b32ea48 100644
--- a/setup.py
+++ b/setup.py
@@ -7,9 +7,7 @@
 #   - python setup.py bdist_wheel --universal
 
 # How to build for conda (do both with 2.7 and 3.4)
-#   - python setup.py bdist_conda
-#   - conda convert -p all /PATH/TO/FILE -o conda_dist
-#   - cd conda_dist && conda index *
+#   - bash conda_build.sh
 
 
 import os

From 6ebd520604b90bac4504a2e42092ba7368feee1e Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Tue, 15 Sep 2015 15:08:04 -0400
Subject: [PATCH 02/13] Added testing for python 3.5

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index e335212..6da901f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,4 +3,5 @@ python:
   - "2.7"
   - "3.3"
   - "3.4"
+  - "3.5"
 script: "python setup.py test"

From c5dabcf8bb1f1c151d62531e2c8dfbe3d6585a51 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Mon, 26 Oct 2015 16:50:19 -0400
Subject: [PATCH 03/13] First go

---
 pyplink/pyplink.py            | 115 +++++++++++++++++++++++++++-------
 pyplink/tests/test_pyplink.py |  17 +++--
 2 files changed, 104 insertions(+), 28 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index 16acf5b..2277f60 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -27,6 +27,11 @@
 
 import os
 
+try:
+    from itertools import zip_longest as zip_longest
+except ImportError:
+    from itertools import izip_longest as zip_longest
+
 import numpy as np
 import pandas as pd
 
@@ -43,9 +48,10 @@
 
 # The recoding values
 _geno_recode = {1: -1,  # Unknown genotype
-                2: 1,   # Heterozygous genotype
-                0: 2,   # Homozygous A1
-                3: 0}   # Homozygous A2
+                2:  1,  # Heterozygous genotype
+                0:  2,  # Homozygous A1
+                3:  0}  # Homozygous A2
+_byte_recode = dict(value[::-1] for value in _geno_recode.items())
 
 
 class PyPlink(object):
@@ -60,40 +66,58 @@ class PyPlink(object):
         dtype=np.int8,
     )
 
-    def __init__(self, i_prefix):
+    def __init__(self, i_prefix, mode="r"):
         """Initializes a new PyPlink object.
 
-        :param i_prefix: the prefix of the binary Plink files
-        :type i_prefix: str
+        Args:
+            i_prefix (str): the prefix of the binary Plink files
+            mode (str): the open mode for the binary Plink file
+            nb_samples (int): the number of samples
 
-        Reads binary Plink files (BED, BIM and FAM) and create the objects.
+        Reads or write binary Plink files (BED, BIM and FAM).
 
         """
-        # These are the name of the input files
+        # The mode
+        self._mode = mode
+
+        # These are the name of the files
         self.bed_filename = "{}.bed".format(i_prefix)
         self.bim_filename = "{}.bim".format(i_prefix)
         self.fam_filename = "{}.fam".format(i_prefix)
 
-        # Checking that all the files exists (otherwise, there's nothing to do)
-        for filename in (self.bed_filename, self.bim_filename,
-                         self.fam_filename):
-            if not os.path.isfile(filename):
-                raise IOError("No such file: '{}'".format(filename))
+        if self._mode == "r":
+            # Checking that all the files exists (otherwise, there's nothing to
+            # do)
+            for filename in (self.bed_filename, self.bim_filename,
+                             self.fam_filename):
+                if not os.path.isfile(filename):
+                    raise IOError("No such file: '{}'".format(filename))
+
+            # Setting BIM and FAM to None
+            self._bim = None
+            self._fam = None
+
+            # Reading the input files
+            self._read_bim()
+            self._read_fam()
+            self._read_bed()
 
-        # Setting BIM and FAM to None
-        self._bim = None
-        self._fam = None
+            # Where we're at
+            self._n = 0
 
-        # Reading the input files
-        self._read_bim()
-        self._read_fam()
-        self._read_bed()
+        elif self._mode == "w":
+            # The dummy number of samples and bytes
+            self._nb_samples = None
 
-        # Where we're at
-        self._n = 0
+            # Opening the output BED file
+            self._bed_file = open(self.bed_filename, "wb")
+            self._write_bed_header()
+
+        else:
+            raise ValueError("invalid mode: '{}'".format(self._mode))
 
     def __repr__(self):
-        """The __repr__ function."""
+        """The representation of the PyPlink object."""
         return "PyPlink({:,d} samples; {:,d} markers)".format(
             self.get_nb_samples(),
             self.get_nb_markers(),
@@ -107,6 +131,16 @@ def __next__(self):
         """The __next__ function."""
         return self.next()
 
+    def __enter__(self):
+        """Entering the context manager."""
+        return self
+
+    def __exit__(self, *args):
+        """Exiting the context manager."""
+        if self._mode == "w":
+            # Closing the BED file
+            self._bed_file.close()
+
     def next(self):
         """The next function."""
         if self._n < self._nb_markers:
@@ -136,8 +170,10 @@ def _read_bim(self):
         bim = pd.read_csv(self.bim_filename, sep="\t",
                           names=original_bim_cols)
 
-        # The 'snp' should always be strings
+        # The 'snp', 'a1' and 'a2' columns should always be strings
         bim["snp"] = bim["snp"].astype(str)
+        bim["a1"] = bim["a1"].astype(str)
+        bim["a2"] = bim["a2"].astype(str)
 
         bim = bim.set_index("snp", drop=False)
         bim["i"] = range(len(bim))
@@ -229,6 +265,11 @@ def _read_bed(self):
         # Saving the data in the object
         self._bed = data
 
+    def _write_bed_header(self):
+        """Writes the BED first 3 bytes."""
+        # Writing the first three bytes
+        self._bed_file.write(bytearray((108, 27, 1)))
+
     def iter_geno(self):
         """Iterates over genotypes."""
         for i in range(len(self._bed)):
@@ -307,3 +348,29 @@ def get_acgt_geno_marker(self, marker):
         geno = self._geno_values[self._bed[i]].flatten(order="C")
 
         return self._allele_encoding[i][geno[:self._nb_samples]]
+
+    def write_marker(self, genotypes):
+        """Write genotypes to binary file."""
+        # Initializing the number of samples if required
+        if self._nb_samples is None:
+            self._nb_samples = len(genotypes)
+
+        # Checking the expected number of samples
+        if self._nb_samples != len(genotypes):
+            raise ValueError("{:,d} samples expected, got {:,d}".format(
+                self._nb_samples,
+                len(genotypes),
+            ))
+
+        # Writing to file
+        byte_array = [
+            g[0] | (g[1] << 2) | (g[2] << 4) | (g[3] << 6) for g in
+            self._grouper((_byte_recode[geno] for geno in genotypes), 4)
+        ]
+        self._bed_file.write(bytearray(byte_array))
+
+    @staticmethod
+    def _grouper(iterable, n, fillvalue=0):
+        """Collect data into fixed-length chunks or blocks."""
+        args = [iter(iterable)] * n
+        return zip_longest(fillvalue=fillvalue, *args)
diff --git a/pyplink/tests/test_pyplink.py b/pyplink/tests/test_pyplink.py
index d7a2842..270b2b2 100644
--- a/pyplink/tests/test_pyplink.py
+++ b/pyplink/tests/test_pyplink.py
@@ -519,7 +519,16 @@ def test_get_acgt_geno_marker(self):
         # Asking for an unknown marker should raise an ValueError
         with self.assertRaises(ValueError) as cm:
             self.pedfile.get_acgt_geno_marker("dummy_marker")
-        self.assertEqual(
-            "dummy_marker: marker not in BIM",
-            str(cm.exception),
-        )
+        self.assertEqual("dummy_marker: marker not in BIM", str(cm.exception))
+
+    def test_get_context_read_mode(self):
+        """Tests the PyPlink object as context manager."""
+        with PyPlink(self.prefix) as genotypes:
+            self.assertEqual(3, len(genotypes.get_fam().head(n=3)))
+
+    def test_invalid_mode(self):
+        """Tests invalid mode when PyPlink as context manager."""
+        with self.assertRaises(ValueError) as cm:
+            with PyPlink(self.prefix, "u") as genotypes:
+                pass
+        self.assertEqual("invalid mode: 'u'", str(cm.exception))

From 07c61c8d7f1f054ad2a67e343a4fd16c58c8be49 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Mon, 2 Nov 2015 10:11:36 -0500
Subject: [PATCH 04/13] Changed for whitespace delimiter for the BIM and FAM
 files

---
 pyplink/pyplink.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index 2277f60..5f5332b 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -167,7 +167,7 @@ def _read_bim(self):
         original_bim_cols = ["chrom", "snp", "cm", "pos", "a1", "a2"]
 
         # Reading the BIM file and setting the values
-        bim = pd.read_csv(self.bim_filename, sep="\t",
+        bim = pd.read_csv(self.bim_filename, delim_whitespace=True,
                           names=original_bim_cols)
 
         # The 'snp', 'a1' and 'a2' columns should always be strings
@@ -208,7 +208,7 @@ def _read_fam(self):
         self.original_fam_cols = ["fid", "iid", "father", "mother", "gender",
                                   "status"]
         # Reading the FAM file and setting the values
-        fam = pd.read_csv(self.fam_filename, sep=" ",
+        fam = pd.read_csv(self.fam_filename, delim_whitespace=True,
                           names=self.original_fam_cols)
 
         # 'fid' and 'iid' should always be strings (more logical that way)

From caafd2ca444a023c18573ef6f0b2dc5c2b653bee Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Mon, 2 Nov 2015 11:04:20 -0500
Subject: [PATCH 05/13] Added testing for writing plink binary files

---
 pyplink/tests/test_pyplink.py | 81 +++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/pyplink/tests/test_pyplink.py b/pyplink/tests/test_pyplink.py
index 270b2b2..978a7ca 100644
--- a/pyplink/tests/test_pyplink.py
+++ b/pyplink/tests/test_pyplink.py
@@ -23,6 +23,8 @@
 # THE SOFTWARE.
 
 
+from __future__ import print_function
+
 import os
 import random
 import shutil
@@ -532,3 +534,82 @@ def test_invalid_mode(self):
             with PyPlink(self.prefix, "u") as genotypes:
                 pass
         self.assertEqual("invalid mode: 'u'", str(cm.exception))
+
+    def test_write_binary(self):
+        """Tests writing a Plink binary file."""
+        # The expected genotypes
+        expected_genotypes = [
+            np.array([0, 0, 0, 1, 0, 1, 2], dtype=int),
+            np.array([0, 0, 0, 0, -1, 0, 1], dtype=int),
+            np.array([0, -1, -1, 2, 0, 0, 0], dtype=int),
+        ]
+
+        # The prefix
+        test_prefix = os.path.join(self.tmp_dir, "test_write")
+
+        # Writing the binary file
+        with PyPlink(test_prefix, "w") as plink:
+            for genotypes in expected_genotypes:
+                plink.write_marker(genotypes)
+
+        # Checking the file exists
+        self.assertTrue(os.path.isfile(test_prefix + ".bed"))
+
+        # Writing the FAM file
+        with open(test_prefix + ".fam", "w") as o_file:
+            for i in range(7):
+                print("f{}".format(i+1), "s{}".format(i+1), "0", "0",
+                      random.choice((1, 2)), "-9", sep=" ", file=o_file)
+
+        # Writing the BIM file
+        with open(test_prefix + ".bim", "w") as o_file:
+            for i in range(3):
+                print(i+1, "m{}".format(i+1), "0", i+1, "T", "A", sep="\t",
+                      file=o_file)
+
+        # Reading the written binary file
+        plink = PyPlink(test_prefix)
+        for i, (marker, genotypes) in enumerate(plink):
+            self.assertEqual("m{}".format(i+1), marker)
+            self.assertTrue((expected_genotypes[i] == genotypes).all())
+
+    def test_write_binary_error(self):
+        """Tests writing a binary file, with different number of sample."""
+        # The expected genotypes
+        expected_genotypes = [
+            np.array([0, 0, 0, 1, 0, 1, 2], dtype=int),
+            np.array([0, 0, 0, 0, -1, 0], dtype=int),
+            np.array([0, -1, -1, 2, 0, 0, 0], dtype=int),
+        ]
+
+        # The prefix
+        test_prefix = os.path.join(self.tmp_dir, "test_write")
+
+        # Writing the binary file
+        with self.assertRaises(ValueError) as cm:
+            with PyPlink(test_prefix, "w") as plink:
+                for genotypes in expected_genotypes:
+                    plink.write_marker(genotypes)
+        self.assertEqual("7 samples expected, got 6", str(cm.exception))
+
+    def test_grouper_padding(self):
+        """Tests the _grouper function (when padding is required)."""
+        expected_chunks = [
+            (0, 1, 2),
+            (3, 4, 5),
+            (6, 7, 8),
+            (9, 0, 0),
+        ]
+        observed_chunks = PyPlink._grouper(range(10), 3)
+        for expected, observed in zip(expected_chunks, observed_chunks):
+            self.assertEqual(expected, observed)
+
+    def test_grouper_no_padding(self):
+        """Tests the _grouper function (when padding is not required)."""
+        expected_chunks = [
+            (0, 1, 2, 3, 4),
+            (5, 6, 7, 8, 9),
+        ]
+        observed_chunks = PyPlink._grouper(range(10), 5)
+        for expected, observed in zip(expected_chunks, observed_chunks):
+            self.assertEqual(expected, observed)

From 8fca74f159d22c573aa9f32470e805d0c906dda4 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Mon, 2 Nov 2015 13:32:42 -0500
Subject: [PATCH 06/13] Minor modifications

---
 pyplink/pyplink.py | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py
index 5f5332b..be6db36 100644
--- a/pyplink/pyplink.py
+++ b/pyplink/pyplink.py
@@ -163,19 +163,16 @@ def seek(self, n):
 
     def _read_bim(self):
         """Reads the BIM file."""
-        # The original BIM columns
-        original_bim_cols = ["chrom", "snp", "cm", "pos", "a1", "a2"]
-
         # Reading the BIM file and setting the values
         bim = pd.read_csv(self.bim_filename, delim_whitespace=True,
-                          names=original_bim_cols)
+                          names=["chrom", "snp", "cm", "pos", "a1", "a2"])
 
         # The 'snp', 'a1' and 'a2' columns should always be strings
         bim["snp"] = bim["snp"].astype(str)
         bim["a1"] = bim["a1"].astype(str)
         bim["a2"] = bim["a2"].astype(str)
 
-        bim = bim.set_index("snp", drop=False)
+        bim = bim.set_index("snp")
         bim["i"] = range(len(bim))
         bim[2] = bim.a1 * 2           # Original '0'
         bim[1] = bim.a1 + bim.a2      # Original '2'
@@ -204,16 +201,16 @@ def get_nb_markers(self):
 
     def _read_fam(self):
         """Reads the FAM file."""
-        # The original FAM columns
-        self.original_fam_cols = ["fid", "iid", "father", "mother", "gender",
-                                  "status"]
         # Reading the FAM file and setting the values
         fam = pd.read_csv(self.fam_filename, delim_whitespace=True,
-                          names=self.original_fam_cols)
+                          names=["fid", "iid", "father", "mother", "gender",
+                                 "status"])
 
-        # 'fid' and 'iid' should always be strings (more logical that way)
+        # The sample IDs should always be strings (more logical that way)
         fam["fid"] = fam["fid"].astype(str)
         fam["iid"] = fam["iid"].astype(str)
+        fam["father"] = fam["father"].astype(str)
+        fam["mother"] = fam["mother"].astype(str)
 
         fam["byte"] = [
             int(np.ceil((1 + 1) / 4.0)) - 1 for i in range(len(fam))

From b82ea4832c0b8c7fc022f46ff8d4aafa26279afe Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Tue, 3 Nov 2015 10:22:20 -0500
Subject: [PATCH 07/13] Updated README and demo added

---
 .gitignore              |   2 +
 README.mkd              | 162 +-------
 demo/PyPlink Demo.ipynb | 835 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 852 insertions(+), 147 deletions(-)
 create mode 100644 demo/PyPlink Demo.ipynb

diff --git a/.gitignore b/.gitignore
index 6e3bdf9..290f917 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,6 @@ pyplink/version.py
 .coveragerc
 htmlcov
 
+.ipynb_checkpoints
+
 build
diff --git a/README.mkd b/README.mkd
index 80b6197..963532e 100644
--- a/README.mkd
+++ b/README.mkd
@@ -2,9 +2,9 @@
 [![PyPI version](https://badge.fury.io/py/pyplink.svg)](http://badge.fury.io/py/pyplink)
 
 
-# pyplink - Module to read binary files from Plink
+# pyplink - Module to process Plink's binary files
 
-`PyPlink` is a Python module to read binary Plink files.
+`PyPlink` is a Python module to read and write Plink's binary files.
 
 
 ## Dependencies
@@ -13,7 +13,8 @@ The tool requires a standard [Python](http://python.org/) installation (2.7 or
 3.4) with the following modules:
 
 1. [numpy](http://www.numpy.org/) version 1.8.2 or latest
-1. [pandas](http://pandas.pydata.org/) version 0.14.1 or latest
+2. [pandas](http://pandas.pydata.org/) version 0.14.1 or latest
+3. [six](https://pythonhosted.org/six/) version 1.9.0 or latest
 
 The tool has been tested on *Linux* only, but should work on *MacOS* and
 *Windows* operating systems as well.
@@ -34,8 +35,8 @@ conda install pyplink -c http://statgen.org/wp-content/uploads/Softwares/pyplink
 ```
 
 It is possible to add the channel to conda's configuration, so that the
-`-c http://statgen.org/...` can be omitted to update or install. Only perform
-the following command:
+`-c http://statgen.org/...` can be omitted to update or install the package.
+To add the channel, perform the following command:
 
 ```bash
 conda config --add channels http://statgen.org/wp-content/uploads/Softwares/pyplink
@@ -61,146 +62,6 @@ conda update pyplink -c http://statgen.org/wp-content/uploads/Softwares/pyplink
 ```
 
 
-## Example
-
-This example describe how to work with the `pyplink` module.
-
-
-### Data description
-
-```python
->>> from pyplink import PyPlink
->>> pedfile = PyPlink("prefix")
->>> pedfile.get_nb_samples()
-10656
->>> pedfile.get_nb_markers()
-141701
->>> samples = pedfile.get_fam()
->>> samples.head()
-        fid       iid  father  mother  gender  status
-0  Sample_1  Sample_1       0       0       2      -9
-1  Sample_2  Sample_2       0       0       1      -9
-2  Sample_3  Sample_3       0       0       2      -9
-3  Sample_4  Sample_4       0       0       1      -9
-4  Sample_5  Sample_5       0       0       2      -9
->>> all_markers = pedfile.get_bim()
->>> all_markers.head()
-            chrom     pos a1 a2
-snp                               
-exm2268640      1  762320  A  G
-exm47           1  865628  A  G
-exm53           1  865665  A  G
-exm55           1  865694  A  G
-exm56           1  865700  A  G
-```
-
-
-### Iterating over all markers
-
-Cycling through genotypes as `-1`, `0`, `1` and `2` values, where `-1` is
-unknown, `0` is homozygous (major allele), `1` is heterozygous and `2` is
-homozygous (minor allele).
-
-```python
->>> for marker_id, genotypes in pedfile:
-...     print(marker_id, genotypes)
-...     break
-... 
-exm2268640 [0 0 0 ..., 0 0 0]
->>> for marker_id, genotypes in pedfile.iter_geno():
-...     print(marker_id, genotypes)
-...     break
-... 
-exm2268640 [0 0 0 ..., 0 0 0]
-```
-
-Cycling through genotypes as `A`, `C`, `G`, `T` values.
-
-```python
->>> for marker_id, genotypes in pedfile.iter_acgt_geno():
-...     print(marker_id, genotypes)
-...     break
-... 
-exm2268640 ['GG' 'GG' 'GG' ..., 'GG' 'GG' 'GG']
-```
-
-
-### Iterating over selected markers
-
-Cycling through genotypes as `-1`, `0`, `1` and `2` values.
-
-```python
->>> markers = ["exm47", "exm2253575", "exm269"]
->>> for marker_id, genotypes in pedfile.iter_geno_marker(markers):
-...     print(marker_id, genotypes)
-... 
-exm47 [0 0 0 ..., 0 0 0]
-exm2253575 [1 1 1 ..., 0 0 2]
-exm269 [0 0 0 ..., 0 1 0]
-```
-
-Cycling through genotypes as `A`, `C`, `G`, `T` values.
-
-```python
->>> for marker_id, genotypes in pedfile.iter_acgt_geno_marker(markers):
-...     print(marker_id, genotypes)
-... 
-exm47 ['GG' 'GG' 'GG' ..., 'GG' 'GG' 'GG']
-exm2253575 ['GA' 'GA' 'GA' ..., 'AA' 'AA' 'GG']
-exm269 ['GG' 'GG' 'GG' ..., 'GG' 'AG' 'GG']
-```
-
-
-### Getting a single marker
-
-To get the genotypes (as `-1`, `0`, `1` and `2` values) of a single marker:
-```python
->>> pedfile.get_geno_marker("exm47")
-[0 0 0 ..., 0 0 0]
-```
-
-To get the genotypes (as `A`, `C`, `G`, `T` values) of a single marker:
-```python
->>> pedfile.get_acgt_geno_marker("exm47")
-['GG' 'GG' 'GG' ..., 'GG' 'GG' 'GG']
-```
-
-
-### Misc example
-
-To get all markers on the Y chromosomes for the males.
-
-```python
->>> y_markers = all_markers[all_markers.chrom == 23].index.values
->>> males = samples.gender == 1
->>> for marker_id, genotypes in pedfile.iter_geno_marker(y_markers):
-...     male_genotypes = genotypes[males.values]
-...     print("{:,d} total genotypes".format(len(genotypes)))
-...     print("{:,d} genotypes for {:,d} "
-...           "males".format(len(male_genotypes), males.sum()))
-...     break
-... 
-10,656 total genotypes
-6,297 genotypes for 6,297 males
-```
-
-To count the minor allele frequency.
-
-```python
->>> from collections import Counter
->>> markers = ["exm47", "exm2253575", "exm269"]
->>> for marker_id, genotypes in pedfile.iter_geno_marker(markers):
-...     geno_counter = Counter(genotypes)
-...     nb_alleles = sum(geno_counter[i] * 2 for i in range(3))
-...     maf = (geno_counter[2] * 2 + geno_counter[1]) / nb_alleles
-...     print(marker_id, maf)
-... 
-exm47 0.0070389488503050214
-exm2253575 0.3461719116956318
-exm269 0.05987799155326138
-```
-
-
 ## Testing
 
 To test the module, just perform the following command:
@@ -208,9 +69,16 @@ To test the module, just perform the following command:
 ```python
 >>> import pyplink
 >>> pyplink.test()
-.................
+.......................
 ----------------------------------------------------------------------
-Ran 17 tests in 0.060s
+Ran 23 tests in 0.149s
 
 OK
 ```
+
+
+## Example
+
+The following
+[notebook](http://nbviewer.ipython.org/github/lemieuxl/pyplink/blob/binary_write/demo/PyPlink%20Demo.ipynb)
+contains a demonstration (for both Python 2 and 3) of the `PyPlink` module.
diff --git a/demo/PyPlink Demo.ipynb b/demo/PyPlink Demo.ipynb
new file mode 100644
index 0000000..912b188
--- /dev/null
+++ b/demo/PyPlink Demo.ipynb	
@@ -0,0 +1,835 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from __future__ import print_function, division"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# `PyPlink`\n",
+    "\n",
+    "`PyPlink` is a Python module to read and write binary Plink files. Here are small examples for `PyPlink`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from pyplink import PyPlink"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Table of contents\n",
+    "\n",
+    "* [**Reading binary pedfile**](#Reading-binary-pedfile)\n",
+    "    * [Getting the demo data](#Getting-the-demo-data)\n",
+    "    * [Reading the binary file](#Reading-the-binary-file)\n",
+    "    * [Getting dataset information](#Getting-dataset-information)\n",
+    "    * [Iterating over all markers](#Iterating-over-all-markers)\n",
+    "        * [*Additive format*](#iterating_over_all_additive)\n",
+    "        * [*Nucleotide format*](#iterating_over_all_nuc)\n",
+    "    * [Iterating over selected markers](#Iterating-over-selected-markers)\n",
+    "        * [*Additive format*](#iterating_over_selected_additive)\n",
+    "        * [*Nucleotide format*](#iterating_over_selected_nuc)\n",
+    "    * [Extracting a single marker](#Extracting-a-single-marker)\n",
+    "        * [*Additive format*](#extracting_additive)\n",
+    "        * [*Nucleotide format*](#extracting_nuc)\n",
+    "    * [Misc example](#Misc-example)\n",
+    "        * [*Extracting a subset of markers and samples*](#Extracting-a-subset-of-markers-and-samples)\n",
+    "        * [*Counting the allele frequency of markers*](#Counting-the-allele-frequency-of-markers)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Reading binary pedfile\n",
+    "\n",
+    "### Getting the demo data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import zipfile\n",
+    "try:\n",
+    "    from urllib.request import urlretrieve\n",
+    "except ImportError:\n",
+    "    from urllib import urlretrieve\n",
+    "\n",
+    "# Downloading the demo data from Plink webset\n",
+    "urlretrieve(\n",
+    "    \"http://pngu.mgh.harvard.edu/~purcell/plink/dist/hapmap_r23a.zip\",\n",
+    "    \"hapmap_r23a.zip\",\n",
+    ")\n",
+    "\n",
+    "# Extracting the archive content\n",
+    "with zipfile.ZipFile(\"hapmap_r23a.zip\", \"r\") as z:\n",
+    "    z.extractall(\".\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Reading the binary file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "pedfile = PyPlink(\"hapmap_r23a\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Getting dataset information"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "270 samples and 4,098,136 markers\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"{:,d} samples and {:,d} markers\".format(\n",
+    "    pedfile.get_nb_samples(),\n",
+    "    pedfile.get_nb_markers(),\n",
+    "))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>fid</th>\n",
+       "      <th>iid</th>\n",
+       "      <th>father</th>\n",
+       "      <th>mother</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>status</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>NA18524</td>\n",
+       "      <td>NA18524</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>-9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>NA18526</td>\n",
+       "      <td>NA18526</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>-9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>NA18529</td>\n",
+       "      <td>NA18529</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>-9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>NA18532</td>\n",
+       "      <td>NA18532</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>-9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>NA18537</td>\n",
+       "      <td>NA18537</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>-9</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       fid      iid father mother  gender  status\n",
+       "0  NA18524  NA18524      0      0       1      -9\n",
+       "1  NA18526  NA18526      0      0       2      -9\n",
+       "2  NA18529  NA18529      0      0       2      -9\n",
+       "3  NA18532  NA18532      0      0       2      -9\n",
+       "4  NA18537  NA18537      0      0       2      -9"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_samples = pedfile.get_fam()\n",
+    "all_samples.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>chrom</th>\n",
+       "      <th>pos</th>\n",
+       "      <th>cm</th>\n",
+       "      <th>a1</th>\n",
+       "      <th>a2</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>snp</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>rs10399749</th>\n",
+       "      <td>1</td>\n",
+       "      <td>45162</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>rs2949420</th>\n",
+       "      <td>1</td>\n",
+       "      <td>45257</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>T</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>rs4030303</th>\n",
+       "      <td>1</td>\n",
+       "      <td>72434</td>\n",
+       "      <td>0</td>\n",
+       "      <td>A</td>\n",
+       "      <td>G</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>rs4030300</th>\n",
+       "      <td>1</td>\n",
+       "      <td>72515</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>rs3855952</th>\n",
+       "      <td>1</td>\n",
+       "      <td>77689</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>A</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            chrom    pos  cm a1 a2\n",
+       "snp                               \n",
+       "rs10399749      1  45162   0  0  C\n",
+       "rs2949420       1  45257   0  0  T\n",
+       "rs4030303       1  72434   0  A  G\n",
+       "rs4030300       1  72515   0  0  C\n",
+       "rs3855952       1  77689   0  0  A"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_markers = pedfile.get_bim()\n",
+    "all_markers.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Iterating over all markers\n",
+    "\n",
+    "<a id=\"iterating_over_all_additive\"></a>\n",
+    "#### Additive format\n",
+    "\n",
+    "Cycling through genotypes as `-1`, `0`, `1` and `2` values, where `-1` is unknown, `0` is homozygous (major allele), `1` is heterozygous, and `2` is homozygous (minor allele)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "rs10399749\n",
+      "[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0\n",
+      "  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0 -1\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0 -1  0  0  0\n",
+      "  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0 -1  0  0  0  0  0  0  0\n",
+      " -1  0 -1  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0 -1  0 -1  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n"
+     ]
+    }
+   ],
+   "source": [
+    "for marker_id, genotypes in pedfile:\n",
+    "    print(marker_id)\n",
+    "    print(genotypes)\n",
+    "    break"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "rs10399749\n",
+      "[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0\n",
+      "  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0 -1\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0 -1  0  0  0\n",
+      "  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0 -1  0  0  0  0  0  0  0\n",
+      " -1  0 -1  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0 -1  0 -1  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n"
+     ]
+    }
+   ],
+   "source": [
+    "for marker_id, genotypes in pedfile.iter_geno():\n",
+    "    print(marker_id)\n",
+    "    print(genotypes)\n",
+    "    break"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<a id=\"iterating_over_all_nuc\"></a>\n",
+    "#### Nucleotide format\n",
+    "\n",
+    "Cycling through genotypes as `A`, `C`, `G` and `T` values (where `00` is unknown)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "rs10399749\n",
+      "['CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n",
+      " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n",
+      " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC'\n",
+      " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n",
+      " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n",
+      " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n",
+      " 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC'\n",
+      " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n",
+      " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC'\n",
+      " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n",
+      " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00'\n",
+      " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC'\n",
+      " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n",
+      " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n",
+      " 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n",
+      " '00' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC'\n",
+      " 'CC' 'CC' 'CC' '00' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n",
+      " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC']\n"
+     ]
+    }
+   ],
+   "source": [
+    "for marker_id, genotypes in pedfile.iter_acgt_geno():\n",
+    "    print(marker_id)\n",
+    "    print(genotypes)\n",
+    "    break"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Iterating over selected markers\n",
+    "\n",
+    "<a id=\"iterating_over_selected_additive\"></a>\n",
+    "#### Additive format\n",
+    "\n",
+    "Cycling through genotypes as `-1`, `0`, `1` and `2` values, where `-1` is unknown, `0` is homozygous (major allele), `1` is heterozygous, and `2` is homozygous (minor allele)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "rs7092431\n",
+      "[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
+      "\n",
+      "rs9943770\n",
+      "[ 0  0  0  2  0  2  0  1  1  0  0  0  0  0  0  0  1  0  0  1  0  0  1  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  1  0  1  0  0\n",
+      "  1  0  1  0  0  1  1  1  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  1\n",
+      "  0  0  0  0  0  0  0  0  1  0  0  1  0  0  0  0  1  1  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  1  0  0  1  0  0  0  1  0  0  0  0  0  0  0\n",
+      "  0  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  1  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  1 -1  0  0  0  1  2  1  1  0  1  1  2  0  0  0  1  0  0  0  0  1  1  0  2\n",
+      "  1  1  1  0  1  1 -1  1  0  0  1  0  0  2  0  1  2  0  1  1  1  2  0  1  0\n",
+      "  0  0  0  0  0  0  2  1  2  1  1  2  1  1  1  1  0  1  1  2  1  0 -1  0  1\n",
+      "  1  1  0  1  0  1 -1  2  1  0  0  0  1  1  1  2  0  0  1  1]\n",
+      "\n",
+      "rs1587483\n",
+      "[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  1  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
+      "  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "markers = [\"rs7092431\", \"rs9943770\", \"rs1587483\"]\n",
+    "for marker_id, genotypes in pedfile.iter_geno_marker(markers):\n",
+    "    print(marker_id)\n",
+    "    print(genotypes, end=\"\\n\\n\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<a id=\"iterating_over_selected_nuc\"></a>\n",
+    "#### Nucleotide format\n",
+    "\n",
+    "Cycling through genotypes as `A`, `C`, `G` and `T` values (where `00` is unknown)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "rs7092431\n",
+      "['GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' '00'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG']\n",
+      "\n",
+      "rs9943770\n",
+      "['AA' 'AA' 'AA' 'GG' 'AA' 'GG' 'AA' 'GA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'\n",
+      " 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'\n",
+      " 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'\n",
+      " 'GA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'GA' 'AA' 'AA' 'GA' 'GA' 'GA' 'AA' 'AA'\n",
+      " 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA'\n",
+      " 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA'\n",
+      " 'AA' 'GA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'\n",
+      " 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA'\n",
+      " 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'\n",
+      " 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA'\n",
+      " 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'\n",
+      " 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' '00' 'AA' 'AA' 'AA'\n",
+      " 'GA' 'GG' 'GA' 'GA' 'AA' 'GA' 'GA' 'GG' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA'\n",
+      " 'AA' 'GA' 'GA' 'AA' 'GG' 'GA' 'GA' 'GA' 'AA' 'GA' 'GA' '00' 'GA' 'AA' 'AA'\n",
+      " 'GA' 'AA' 'AA' 'GG' 'AA' 'GA' 'GG' 'AA' 'GA' 'GA' 'GA' 'GG' 'AA' 'GA' 'AA'\n",
+      " 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GG' 'GA' 'GG' 'GA' 'GA' 'GG' 'GA' 'GA' 'GA'\n",
+      " 'GA' 'AA' 'GA' 'GA' 'GG' 'GA' 'AA' '00' 'AA' 'GA' 'GA' 'GA' 'AA' 'GA' 'AA'\n",
+      " 'GA' '00' 'GG' 'GA' 'AA' 'AA' 'AA' 'GA' 'GA' 'GA' 'GG' 'AA' 'AA' 'GA' 'GA']\n",
+      "\n",
+      "rs1587483\n",
+      "['GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' '00' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' '00' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'CG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' '00' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n",
+      " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG']\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "markers = [\"rs7092431\", \"rs9943770\", \"rs1587483\"]\n",
+    "for marker_id, genotypes in pedfile.iter_acgt_geno_marker(markers):\n",
+    "    print(marker_id)\n",
+    "    print(genotypes, end=\"\\n\\n\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Extracting a single marker\n",
+    "\n",
+    "<a id=\"extracting_additive\"></a>\n",
+    "#### Additive format\n",
+    "\n",
+    "Cycling through genotypes as `-1`, `0`, `1` and `2` values, where `-1` is unknown, `0` is homozygous (major allele), `1` is heterozygous, and `2` is homozygous (minor allele)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ 0,  0,  0,  0,  0, -1,  0,  0, -1, -1,  0,  0,  0,  0,  0,  0,  0,\n",
+       "        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0, -1,\n",
+       "        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,\n",
+       "        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,\n",
+       "        0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,\n",
+       "        0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n",
+       "       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n",
+       "       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n",
+       "       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n",
+       "       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n",
+       "       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n",
+       "       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n",
+       "       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n",
+       "       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n",
+       "       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n",
+       "       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], dtype=int8)"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pedfile.get_geno_marker(\"rs7619974\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<a id=\"extracting_nuc\"></a>\n",
+    "#### Nucleotide format\n",
+    "\n",
+    "Cycling through genotypes as `A`, `C`, `G` and `T` values (where `00` is unknown)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['CC', 'CC', 'CC', 'CC', 'CC', '00', 'CC', 'CC', '00', '00', 'CC',\n",
+       "       'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',\n",
+       "       'CC', 'CC', '00', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',\n",
+       "       '00', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',\n",
+       "       'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',\n",
+       "       'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',\n",
+       "       'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',\n",
+       "       'CC', '00', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',\n",
+       "       'CC', 'CC', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n",
+       "       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n",
+       "       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n",
+       "       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n",
+       "       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n",
+       "       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n",
+       "       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n",
+       "       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n",
+       "       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n",
+       "       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n",
+       "       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n",
+       "       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n",
+       "       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n",
+       "       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n",
+       "       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n",
+       "       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n",
+       "       '00', '00', '00', '00', '00', '00'], \n",
+       "      dtype='<U2')"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pedfile.get_acgt_geno_marker(\"rs7619974\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Misc example\n",
+    "\n",
+    "#### Extracting a subset of markers and samples\n",
+    "\n",
+    "To get all markers on the Y chromosomes for the males."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "270 total genotypes\n",
+      "142 genotypes for 142 males (rs1140798 on chr24 and position 169,542)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Getting the Y markers\n",
+    "y_markers = all_markers[all_markers.chrom == 24].index.values\n",
+    "\n",
+    "# Getting the males\n",
+    "males = all_samples.gender == 1\n",
+    "\n",
+    "# Cycling through the Y markers\n",
+    "for marker_id, genotypes in pedfile.iter_geno_marker(y_markers):\n",
+    "    male_genotypes = genotypes[males.values]\n",
+    "    print(\"{:,d} total genotypes\".format(len(genotypes)))\n",
+    "    print(\"{:,d} genotypes for {:,d} males ({} on chr{} and position {:,d})\".format(\n",
+    "        len(male_genotypes),\n",
+    "        males.sum(),\n",
+    "        marker_id,\n",
+    "        all_markers.loc[marker_id, \"chrom\"],\n",
+    "        all_markers.loc[marker_id, \"pos\"],\n",
+    "    ))\n",
+    "    break"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Counting the allele frequency of markers\n",
+    "\n",
+    "To count the minor allele frequency of a subset of markers (only for founders)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "rs7619974\t0.0\n",
+      "rs2949048\t0.02381\n",
+      "rs16941434\t0.357143\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Getting the founders\n",
+    "founders = (all_samples.father == \"0\") & (all_samples.mother == \"0\")\n",
+    "\n",
+    "# Computing the MAF\n",
+    "markers = [\"rs7619974\", \"rs2949048\", \"rs16941434\"]\n",
+    "for marker_id, genotypes in pedfile.iter_geno_marker(markers):\n",
+    "    valid_genotypes = genotypes[founders.values & (genotypes != -1)]\n",
+    "    maf = valid_genotypes.sum() / (len(valid_genotypes) * 2)\n",
+    "    print(marker_id, round(maf, 6), sep=\"\\t\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}

From c9e330bf8de3c10b546479a02fb0126038b894c3 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Tue, 3 Nov 2015 11:16:00 -0500
Subject: [PATCH 08/13] Added the write section in the demo

---
 demo/PyPlink Demo.ipynb | 402 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 399 insertions(+), 3 deletions(-)

diff --git a/demo/PyPlink Demo.ipynb b/demo/PyPlink Demo.ipynb
index 912b188..8287828 100644
--- a/demo/PyPlink Demo.ipynb	
+++ b/demo/PyPlink Demo.ipynb	
@@ -52,7 +52,10 @@
     "        * [*Nucleotide format*](#extracting_nuc)\n",
     "    * [Misc example](#Misc-example)\n",
     "        * [*Extracting a subset of markers and samples*](#Extracting-a-subset-of-markers-and-samples)\n",
-    "        * [*Counting the allele frequency of markers*](#Counting-the-allele-frequency-of-markers)"
+    "        * [*Counting the allele frequency of markers*](#Counting-the-allele-frequency-of-markers)\n",
+    "\n",
+    "\n",
+    "* [**Writing binary pedfile**](Writing-binary-pedfile)"
    ]
   },
   {
@@ -61,7 +64,9 @@
    "source": [
     "## Reading binary pedfile\n",
     "\n",
-    "### Getting the demo data"
+    "### Getting the demo data\n",
+    "\n",
+    "The [`Plink`](http://pngu.mgh.harvard.edu/~purcell/plink/) softwares provides a testing dataset on the [resources page](http://pngu.mgh.harvard.edu/~purcell/plink/res.shtml). It contains the 270 samples from the HapMap project (release 23) on build GRCh36/hg18."
    ]
   },
   {
@@ -93,7 +98,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Reading the binary file"
+    "### Reading the binary file\n",
+    "\n",
+    "To read a binary file, `PyPlink` only requires the prefix of the files."
    ]
   },
   {
@@ -809,6 +816,395 @@
     "    maf = valid_genotypes.sum() / (len(valid_genotypes) * 2)\n",
     "    print(marker_id, round(maf, 6), sep=\"\\t\")"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Writing binary pedfile\n",
+    "\n",
+    "The following examples shows how to write a binary file using the `PyPlink` module.\n",
+    "\n",
+    "> Note that `PyPlink` only writes the `BED` file. The user is required to create the `FAM` and `BIM` file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# The genotypes for 3 markers and 10 samples\n",
+    "all_genotypes = [\n",
+    "    [0, 0, 0, 1, 0, 0, -1, 2, 1, 0],\n",
+    "    [0, 0, 1, 1, 0, 0,  0, 1, 2, 0],\n",
+    "    [0, 0, 0, 0, 1, 1,  0, 0, 0, 1],\n",
+    "]\n",
+    "\n",
+    "# Writing the BED file using PyPlink\n",
+    "with PyPlink(\"test_output\", \"w\") as pedfile:\n",
+    "    for genotypes in all_genotypes:\n",
+    "        pedfile.write_marker(genotypes)\n",
+    "\n",
+    "# Writing a dummy FAM file\n",
+    "with open(\"test_output.fam\", \"w\") as fam_file:\n",
+    "    for i in range(10):\n",
+    "        print(\"family_{}\".format(i+1), \"sample_{}\".format(i+1), \"0\", \"0\", \"0\", \"-9\",\n",
+    "              sep=\" \", file=fam_file)\n",
+    "\n",
+    "# Writing a dummy BIM file\n",
+    "with open(\"test_output.bim\", \"w\") as bim_file:\n",
+    "    for i in range(3):\n",
+    "        print(\"1\", \"marker_{}\".format(i+1), \"0\", i+1, \"A\", \"T\",\n",
+    "              sep=\"\\t\", file=bim_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Checking the content of the newly created binary files\n",
+    "pedfile = PyPlink(\"test_output\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>fid</th>\n",
+       "      <th>iid</th>\n",
+       "      <th>father</th>\n",
+       "      <th>mother</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>status</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>family_1</td>\n",
+       "      <td>sample_1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>-9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>family_2</td>\n",
+       "      <td>sample_2</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>-9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>family_3</td>\n",
+       "      <td>sample_3</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>-9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>family_4</td>\n",
+       "      <td>sample_4</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>-9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>family_5</td>\n",
+       "      <td>sample_5</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>-9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>family_6</td>\n",
+       "      <td>sample_6</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>-9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>family_7</td>\n",
+       "      <td>sample_7</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>-9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>family_8</td>\n",
+       "      <td>sample_8</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>-9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>family_9</td>\n",
+       "      <td>sample_9</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>-9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>family_10</td>\n",
+       "      <td>sample_10</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>-9</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         fid        iid father mother  gender  status\n",
+       "0   family_1   sample_1      0      0       0      -9\n",
+       "1   family_2   sample_2      0      0       0      -9\n",
+       "2   family_3   sample_3      0      0       0      -9\n",
+       "3   family_4   sample_4      0      0       0      -9\n",
+       "4   family_5   sample_5      0      0       0      -9\n",
+       "5   family_6   sample_6      0      0       0      -9\n",
+       "6   family_7   sample_7      0      0       0      -9\n",
+       "7   family_8   sample_8      0      0       0      -9\n",
+       "8   family_9   sample_9      0      0       0      -9\n",
+       "9  family_10  sample_10      0      0       0      -9"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pedfile.get_fam()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>chrom</th>\n",
+       "      <th>pos</th>\n",
+       "      <th>cm</th>\n",
+       "      <th>a1</th>\n",
+       "      <th>a2</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>snp</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>marker_1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>A</td>\n",
+       "      <td>T</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>marker_2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0</td>\n",
+       "      <td>A</td>\n",
+       "      <td>T</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>marker_3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0</td>\n",
+       "      <td>A</td>\n",
+       "      <td>T</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          chrom  pos  cm a1 a2\n",
+       "snp                           \n",
+       "marker_1      1    1   0  A  T\n",
+       "marker_2      1    2   0  A  T\n",
+       "marker_3      1    3   0  A  T"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pedfile.get_bim()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "marker_1 [ 0  0  0  1  0  0 -1  2  1  0]\n",
+      "marker_2 [0 0 1 1 0 0 0 1 2 0]\n",
+      "marker_3 [0 0 0 0 1 1 0 0 0 1]\n"
+     ]
+    }
+   ],
+   "source": [
+    "for marker, genotypes in pedfile:\n",
+    "    print(marker, genotypes)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The newly created binary files are compatible with Plink."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "@----------------------------------------------------------@\n",
+      "|        PLINK!       |     v1.07      |   10/Aug/2009     |\n",
+      "|----------------------------------------------------------|\n",
+      "|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |\n",
+      "|----------------------------------------------------------|\n",
+      "|  For documentation, citation & bug-report instructions:  |\n",
+      "|        http://pngu.mgh.harvard.edu/purcell/plink/        |\n",
+      "@----------------------------------------------------------@\n",
+      "\n",
+      "Skipping web check... [ --noweb ] \n",
+      "Writing this text to log file [ plink.log ]\n",
+      "Analysis started: Tue Nov  3 11:12:07 2015\n",
+      "\n",
+      "Options in effect:\n",
+      "\t--noweb\n",
+      "\t--bfile test_output\n",
+      "\t--freq\n",
+      "\n",
+      "Reading map (extended format) from [ test_output.bim ] \n",
+      "3 markers to be included from [ test_output.bim ]\n",
+      "Reading pedigree information from [ test_output.fam ] \n",
+      "10 individuals read from [ test_output.fam ] \n",
+      "0 individuals with nonmissing phenotypes\n",
+      "Assuming a disease phenotype (1=unaff, 2=aff, 0=miss)\n",
+      "Missing phenotype value is also -9\n",
+      "0 cases, 0 controls and 10 missing\n",
+      "0 males, 0 females, and 10 of unspecified sex\n",
+      "Warning, found 10 individuals with ambiguous sex codes\n",
+      "These individuals will be set to missing ( or use --allow-no-sex )\n",
+      "Writing list of these individuals to [ plink.nosex ]\n",
+      "Reading genotype bitfile from [ test_output.bed ] \n",
+      "Detected that binary PED file is v1.00 SNP-major mode\n",
+      "Before frequency and genotyping pruning, there are 3 SNPs\n",
+      "10 founders and 0 non-founders found\n",
+      "Writing allele frequencies (founders-only) to [ plink.frq ] \n",
+      "\n",
+      "Analysis finished: Tue Nov  3 11:12:10 2015\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from subprocess import Popen, PIPE\n",
+    "\n",
+    "# Computing frequencies\n",
+    "proc = Popen([\"plink\", \"--noweb\", \"--bfile\", \"test_output\", \"--freq\"],\n",
+    "             stdout=PIPE, stderr=PIPE)\n",
+    "outs, errs = proc.communicate()\n",
+    "print(outs.decode(), end=\"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " CHR        SNP   A1   A2          MAF  NCHROBS\n",
+      "   1   marker_1    A    T       0.2222       18\n",
+      "   1   marker_2    A    T         0.25       20\n",
+      "   1   marker_3    A    T         0.15       20\n"
+     ]
+    }
+   ],
+   "source": [
+    "with open(\"plink.frq\", \"r\") as i_file:\n",
+    "    print(i_file.read(), end=\"\")"
+   ]
   }
  ],
  "metadata": {

From 7d7fe48cf7e4e759ad45b18c02fef3bb15d8fad1 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Tue, 3 Nov 2015 11:17:16 -0500
Subject: [PATCH 09/13] Wrong link

---
 demo/PyPlink Demo.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/demo/PyPlink Demo.ipynb b/demo/PyPlink Demo.ipynb
index 8287828..0db9fcf 100644
--- a/demo/PyPlink Demo.ipynb	
+++ b/demo/PyPlink Demo.ipynb	
@@ -55,7 +55,7 @@
     "        * [*Counting the allele frequency of markers*](#Counting-the-allele-frequency-of-markers)\n",
     "\n",
     "\n",
-    "* [**Writing binary pedfile**](Writing-binary-pedfile)"
+    "* [**Writing binary pedfile**](#Writing-binary-pedfile)"
    ]
   },
   {

From 025a91f18a896d986211181bc20c131147ceb279 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Tue, 3 Nov 2015 11:19:58 -0500
Subject: [PATCH 10/13] Link did not work

---
 demo/PyPlink Demo.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/demo/PyPlink Demo.ipynb b/demo/PyPlink Demo.ipynb
index 0db9fcf..bd193cc 100644
--- a/demo/PyPlink Demo.ipynb	
+++ b/demo/PyPlink Demo.ipynb	
@@ -55,7 +55,7 @@
     "        * [*Counting the allele frequency of markers*](#Counting-the-allele-frequency-of-markers)\n",
     "\n",
     "\n",
-    "* [**Writing binary pedfile**](#Writing-binary-pedfile)"
+    "* [**Writing binary pedfile**](#Writing-binary-pedfile)\n"
    ]
   },
   {

From 17d292a9a3d29206b0e6655cf1f289eea0c707c9 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Tue, 3 Nov 2015 11:23:07 -0500
Subject: [PATCH 11/13] Added a tag

---
 demo/PyPlink Demo.ipynb | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/demo/PyPlink Demo.ipynb b/demo/PyPlink Demo.ipynb
index bd193cc..4367a72 100644
--- a/demo/PyPlink Demo.ipynb	
+++ b/demo/PyPlink Demo.ipynb	
@@ -55,7 +55,7 @@
     "        * [*Counting the allele frequency of markers*](#Counting-the-allele-frequency-of-markers)\n",
     "\n",
     "\n",
-    "* [**Writing binary pedfile**](#Writing-binary-pedfile)\n"
+    "* [**Writing binary pedfile**](#writing_binary)\n"
    ]
   },
   {
@@ -821,6 +821,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "<a id=\"writing_binary\"></a>\n",
     "## Writing binary pedfile\n",
     "\n",
     "The following examples shows how to write a binary file using the `PyPlink` module.\n",

From c83a9dc634c4751be83972ad8b0d1608bc18aba7 Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Tue, 3 Nov 2015 11:27:08 -0500
Subject: [PATCH 12/13] Typo + better link

---
 demo/PyPlink Demo.ipynb | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/demo/PyPlink Demo.ipynb b/demo/PyPlink Demo.ipynb
index 4367a72..e1b9694 100644
--- a/demo/PyPlink Demo.ipynb	
+++ b/demo/PyPlink Demo.ipynb	
@@ -55,7 +55,7 @@
     "        * [*Counting the allele frequency of markers*](#Counting-the-allele-frequency-of-markers)\n",
     "\n",
     "\n",
-    "* [**Writing binary pedfile**](#writing_binary)\n"
+    "* [**Writing binary pedfile**](#Writing-binary-pedfile)"
    ]
   },
   {
@@ -821,12 +821,11 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "<a id=\"writing_binary\"></a>\n",
     "## Writing binary pedfile\n",
     "\n",
     "The following examples shows how to write a binary file using the `PyPlink` module.\n",
     "\n",
-    "> Note that `PyPlink` only writes the `BED` file. The user is required to create the `FAM` and `BIM` file."
+    "> Note that `PyPlink` only writes the `BED` file. The user is required to create the `FAM` and `BIM` files."
    ]
   },
   {

From 76f3162c94f92ed2bbe8f46ea539dd06b0b9b85e Mon Sep 17 00:00:00 2001
From: Louis-Philippe Lemieux Perreault
 <louis-philippe.lemieux.perreault@statgen.org>
Date: Tue, 3 Nov 2015 14:29:43 -0500
Subject: [PATCH 13/13] For next release

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index b32ea48..990aef8 100644
--- a/setup.py
+++ b/setup.py
@@ -15,8 +15,8 @@
 
 
 MAJOR = 1
-MINOR = 0
-MICRO = 2
+MINOR = 1
+MICRO = 0
 VERSION = "{}.{}.{}".format(MAJOR, MINOR, MICRO)