Skip to content

Commit

Permalink
Added a wig file parser.
Browse files Browse the repository at this point in the history
  • Loading branch information
legaultmarc committed Sep 4, 2015
1 parent 3d8c1ab commit 787ee3f
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 2 deletions.
120 changes: 120 additions & 0 deletions gepyto/formats/wig.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
"""
Parser for Wiggle Track Format files.
"""

# This file is part of gepyto.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative
# Commons, PO Box 1866, Mountain View, CA 94042, USA.

__author__ = "Marc-Andre Legault"
__copyright__ = ("Copyright 2014 Marc-Andre Legault and Louis-Philippe "
"Lemieux Perreault. All rights reserved.")
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"


import os

import pandas as pd
import six

from ..structures.region import _Segment, Region



class WiggleFile(object):
    """Parser for Wiggle Track Format (WIG) files.

    The whole input is eagerly parsed into a pandas dataframe holding one
    row per data line. In the process, all the inherent compactness of the
    Wiggle format is lost in exchange for an easier to manage
    representation. This means that more efficient parsers should be used
    for large chunks of data.

    Only the ``fixedStep`` mode is implemented for now.

    :param stream: Either the path to a WIG file or an iterator over its
                   lines (e.g. an open file). The first line has to be a
                   declaration line.

    :raises IOError: If ``stream`` is a string which is not an existing file.
    :raises NotImplementedError: If the first declaration is not
                                 ``fixedStep``.
    :raises ValueError: If the mode changes after parsing started.

    The parsed data is available through :py:meth:`as_dataframe` (or the
    ``data`` attribute). Its columns are ``chrom``, ``start``, ``end`` and
    ``value``, or ``chrom``, ``pos`` and ``value`` when every row spans a
    single base.

    """
    def __init__(self, stream):
        self.stream = stream
        opened_here = False
        if isinstance(stream, six.string_types):
            if not os.path.isfile(stream):
                raise IOError("Can't find file '{}'.".format(stream))
            self.stream = open(stream, "r")
            opened_here = True

        try:
            self.data = self._read_data()
        except Exception:
            # The caller never gets a reference to this object when the
            # constructor fails, so a file opened here would be leaked.
            # Streams provided by the caller are left untouched.
            if opened_here:
                self.stream.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, *params):
        self.close()

    def close(self):
        """Close the underlying stream if it can be closed."""
        # The stream can be any iterator over lines, not only a file, so a
        # missing close method is not an error.
        try:
            self.stream.close()
        except AttributeError:
            pass

    def as_dataframe(self):
        """Return the parsed data as a pandas dataframe."""
        return self.data

    def _read_data(self):
        """Parse the whole stream and return the final dataframe."""
        mode, first_header = self._parse_header(next(self.stream))
        if mode != "fixedStep":
            raise NotImplementedError("fixedStep is the only implemented mode "
                                      "for now.")

        data = self._parse_fixed_step(header=first_header)

        # Use categories for chrom to save space.
        data["chrom"] = data["chrom"].astype("category")

        # When every row spans a single base, start and end are redundant:
        # expose a single "pos" column instead.
        if (data["start"] == data["end"]).all():
            data = data.drop("end", axis=1)
            data.columns = ("chrom", "pos", "value")

        return data

    def _parse_fixed_step(self, header=None):
        """Parse the remaining lines of the stream as fixedStep data.

        :param header: The header parsed from the declaration line that
                       precedes the first data line. It is updated in place
                       while the position advances.

        :returns: A dataframe with the ``chrom``, ``start``, ``end`` and
                  ``value`` columns (one row per data line).

        :raises ValueError: If a declaration line switches to another mode
                            or if a data line is found without any header.

        """
        rows = []
        for line in self.stream:
            if self._is_header(line):
                mode, header = self._parse_header(line)
                # An explicit raise (instead of an assert) so that the check
                # is not stripped when running with -O.
                if mode != "fixedStep":
                    raise ValueError(
                        "Can't change mode after parsing started."
                    )
                continue

            value = line.strip()
            if not value:
                # Blank lines (e.g. a trailing empty line) hold no data.
                continue

            if header is None:
                raise ValueError(
                    "Found a data line before any declaration line."
                )

            start = header["pos"]
            rows.append((
                header["chrom"],
                start,
                start + header["span"] - 1,
                float(value),
            ))
            header["pos"] = start + header["step"]

        return pd.DataFrame(
            rows,
            columns=("chrom", "start", "end", "value")
        )

    @staticmethod
    def _parse_header(line):
        """Parse a declaration line.

        :param line: A line of the form ``mode key=value key=value ...``.

        :returns: A ``(mode, header)`` tuple where ``header`` is a dict of
                  the ``key=value`` fields. ``start``, ``step`` and ``span``
                  are converted to int (``span`` defaults to 1) and ``pos``
                  is initialized to ``start``.

        :raises KeyError: If a ``fixedStep`` declaration lacks ``start`` or
                          ``step``.

        """
        fields = line.rstrip().split()
        mode = fields[0]

        header = dict(field.split("=", 1) for field in fields[1:])

        # start and step are only mandatory for fixedStep declarations.
        # Requiring them for every mode made variableStep declarations fail
        # with a KeyError before the caller could check the mode.
        mandatory = mode == "fixedStep"
        for key in ("start", "step"):
            if mandatory or key in header:
                header[key] = int(header[key])

        header["span"] = int(header.get("span", 1))

        if "start" in header:
            header["pos"] = header["start"]

        return mode, header

    @staticmethod
    def _is_header(line):
        """Check if the line is a variableStep or fixedStep declaration."""
        return line.startswith(("variableStep", "fixedStep"))
17 changes: 15 additions & 2 deletions gepyto/structures/region.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,15 @@
"Lemieux Perreault. All rights reserved.")
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"

__all__ = ["Region", "get_centromere", "get_telomere"]


import re

from .. import settings
from . import sequences

import numpy as np
from six.moves import range


class _Segment(object):
def __init__(self, chrom, start, end):
Expand All @@ -46,6 +47,11 @@ def distance_to(self, segment):
else:
return self.start - segment.end

def as_range(self, iterator=False):
    """Get every position covered by the segment, both ends included.

    When ``iterator`` is true, a lazy ``range`` is returned. Otherwise,
    the positions are materialized in a numpy array.

    """
    first, past_end = self.start, self.end + 1
    return range(first, past_end) if iterator else np.arange(first, past_end)

def __eq__(self, seg):
    """Two segments are equal if they share chrom, start and end.

    Objects that do not expose these three attributes (e.g. ``None``) are
    not comparable: ``NotImplemented`` is returned so that Python falls
    back to its default behavior instead of raising an AttributeError.

    """
    try:
        chrom, start, end = seg.chrom, seg.start, seg.end
    except AttributeError:
        return NotImplemented

    return self.chrom == chrom and self.start == start and self.end == end
Expand Down Expand Up @@ -171,6 +177,13 @@ def distance_to(self, region):
min_dist = d
return min_dist

def as_range(self, iterator=False):
    """Get a range corresponding to all the nucleotide positions.

    For a contiguous region, this is the range of its only segment.

    For a non contiguous region with ``iterator`` false, the positions of
    every segment are concatenated (in the order of ``self.segments``) in
    a single numpy array. Building a set of arrays, as was previously
    done, always failed because numpy arrays are not hashable.

    """
    if self.is_contiguous:
        return self.segments[0].as_range(iterator)

    pieces = [seg.as_range(iterator) for seg in self.segments]
    if iterator or not pieces:
        # NOTE(review): in iterator mode the result is a set of per-segment
        # ranges, not a flat sequence of positions. This already worked
        # before, so it is kept as is for backward compatibility; confirm
        # whether a flat iterator was intended.
        return set(pieces)

    return np.concatenate(pieces)

@property
def chrom(self):
if self.is_contiguous:
Expand Down

0 comments on commit 787ee3f

Please sign in to comment.