Commit v1.0.4

jmschrei committed Mar 11, 2024
1 parent a446005 commit 2437323
Showing 7 changed files with 172 additions and 21 deletions.
10 changes: 10 additions & 0 deletions docs/whats_new.rst
@@ -5,6 +5,16 @@
Release History
===============

Version 1.0.4
==============

Highlights
----------

- Fixed an issue with Markov Chains and ConditionalCategorical distributions
- Added more documentation to ConditionalCategorical distributions


Version 1.0.3
==============

2 changes: 1 addition & 1 deletion pomegranate/__init__.py
@@ -1 +1 @@
__version__ = "1.0.3"
__version__ = "1.0.4"
11 changes: 11 additions & 0 deletions pomegranate/distributions/categorical.py
@@ -38,6 +38,17 @@ class Categorical(Distribution):
Probabilities for each key for each feature, where k is the largest
number of keys across all features. Default is None
n_categories: list, numpy.ndarray, torch.tensor or None, optional
The number of categories for each feature in the data. Only needs to
be provided when the parameters will be learned directly from data and
you want to make sure that the right number of keys is included in each
dimension. Default is None.
pseudocount: float, optional
A value to add to the observed counts of each feature when training.
Setting this to a positive value ensures that no probabilities are
truly zero. Default is 0.
inertia: float, (0, 1), optional
Indicates the proportion of the update to apply to the parameters
during training. When the inertia is 0.0, the update is applied in
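Taken together, these parameters are easiest to see in code. A minimal sketch of the `Categorical` API described above (the probabilities and data are invented for illustration):

```python
import torch
from pomegranate.distributions import Categorical

# Two features; each row of `probs` is one feature, each column one key.
d = Categorical(probs=[[0.2, 0.3, 0.5],
                       [0.1, 0.1, 0.8]])

# X has shape (n_samples, n_features) and holds integer keys.
X = torch.tensor([[0, 2], [1, 2], [2, 2]])
print(d.log_probability(X))  # one log-probability per row

# When learning from data, `n_categories` pins the number of keys per
# feature even if some keys never appear, and a positive `pseudocount`
# keeps unseen keys from getting probability exactly zero.
d2 = Categorical(n_categories=[3, 3], pseudocount=1.0)
d2.fit(X)
```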
88 changes: 86 additions & 2 deletions pomegranate/distributions/conditional_categorical.py
@@ -16,9 +16,69 @@
from ._distribution import ConditionalDistribution
from .categorical import Categorical


class ConditionalCategorical(ConditionalDistribution):
"""Still under development."""

"""A conditional categorical distribution.
This is a categorical distribution that is conditioned on previous
emissions, meaning that the probability of each character depends on the
observed character earlier in the sequence. Each feature is conditioned
independently of the others like a `Categorical` distribution.
This conditioning makes the shape of the distribution a bit more
complicated than the `JointCategorical` distribution. Specifically, a
`JointCategorical` distribution is multivariate by definition but a
`ConditionalCategorical` does not have to be. Although both may appear
similar in that they both take in a vector of characters and return
probabilities, the vector fed into the JointCategorical are all observed
together without some notion of time, whereas the ConditionalCategorical
explicitly requires a notion of timing, where the probability of later
characters depend on the composition of characters seen before.
Parameters
----------
probs: list of numpy.ndarray, torch.tensor or None, shape=(k, k), optional
A list of conditional probabilities with one tensor for each feature
in the data being modeled. Each tensor should have `k+1` dimensions
where `k` is the number of timesteps to condition on. Each dimension
should span the number of keys in that dimension. For example, if
specifying a univariate conditional categorical distribution where
k=2, a valid tensor shape would be [(2, 3, 4)]. Default is None.
n_categories: list, numpy.ndarray, torch.tensor or None, optional
The number of categories for each feature in the data. Only needs to
be provided when the parameters will be learned directly from data and
you want to make sure that the right number of keys is included in each
dimension. Unlike the `Categorical` distribution, this needs to be
a list of shapes with one shape for each feature, matching the shapes
specified in `probs`. Default is None.
pseudocount: float, optional
A value to add to the observed counts of each feature when training.
Setting this to a positive value ensures that no probabilities are
truly zero. Default is 0.
inertia: float, (0, 1), optional
Indicates the proportion of the update to apply to the parameters
during training. When the inertia is 0.0, the update is applied in
its entirety and the previous parameters are ignored. When the
inertia is 1.0, the update is entirely ignored and the previous
parameters are kept, equivalently to if the parameters were frozen.
frozen: bool, optional
Whether all the parameters associated with this distribution are frozen.
If you want to freeze individual parameters, or individual values in those
parameters, you must modify the `frozen` attribute of the tensor or
parameter directly. Default is False.
check_data: bool, optional
Whether to check properties of the data and potentially recast it to
torch.tensors. This does not prevent checking of parameters but can
slightly speed up computation when you know that your inputs are valid.
Setting this to False is also necessary for compiling.
"""

def __init__(self, probs=None, n_categories=None, pseudocount=0,
inertia=0.0, frozen=False, check_data=True):
super().__init__(inertia=inertia, frozen=frozen, check_data=check_data)
@@ -47,6 +107,22 @@ def __init__(self, probs=None, n_categories=None, pseudocount=0,
self._reset_cache()

def _initialize(self, d, n_categories):
"""Initialize the probability distribution.
This method is meant to only be called internally. It initializes the
parameters of the distribution and stores its dimensionality. For more
complex methods, this function will do more.
Parameters
----------
d: int
The dimensionality the distribution is being initialized to.
n_categories: list of tuples
The shape of each conditional distribution, one per feature.
"""

self.n_categories = []
for n_cat in n_categories:
if isinstance(n_cat, (list, tuple)):
Expand All @@ -63,6 +139,14 @@ def _initialize(self, d, n_categories):
super()._initialize(d)
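As a sketch of the `n_categories` structure that `_initialize` expects (shapes invented for illustration): for two features that each condition on one previous timestep, with 2 and 4 keys respectively, the argument is a list of one shape per feature, mirroring the tensors that would appear in `probs`:

```python
n_categories = [(2, 2), (4, 4)]
```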

def _reset_cache(self):
"""Reset the internally stored statistics.
This method is meant to only be called internally. It resets the
stored statistics used to update the model parameters as well as
recalculates the cached values meant to speed up log probability
calculations.
"""

if self._initialized == False:
return

47 changes: 31 additions & 16 deletions pomegranate/markov_chain.py
@@ -40,10 +40,9 @@ class MarkovChain(Distribution):
the number of steps back to model in the sequence. This must be passed
in if the distributions are not passed in.
-n_categories: list, numpy.ndarray, torch.tensor, or None, shape=(d,)
-	A vector with the maximum number of categories that each column
-	can have. If not given, this will be inferred from the data. Default
-	is None.
+n_categories: list, tuple, or None
+	A list or tuple containing the number of categories that each feature
+	has.
inertia: float, [0, 1], optional
Indicates the proportion of the update to apply to the parameters
@@ -82,15 +81,12 @@ def __init__(self, distributions=None, k=None, n_categories=None,

if distributions is not None:
self.k = len(distributions) - 1

-if n_categories is None:
-	self.n_categories = [None for i in range(self.k+1)]

self.d = None
self._initialized = distributions is not None and distributions[0]._initialized
self._reset_cache()

-def _initialize(self, d):
+def _initialize(self, d, n_categories):
"""Initialize the probability distribution.
This method is meant to only be called internally. It initializes the
@@ -102,14 +98,24 @@ def _initialize(self, d):
----------
d: int
The dimensionality the distribution is being initialized to.
n_categories: list, numpy.ndarray, torch.tensor
	The number of categories for each feature in the data. The initial
	`Categorical` distribution is sized with the maximum of these values.
"""

-self.distributions = [Categorical(n_categories=self.n_categories[0])]
-for i in range(self.k):
-	distribution = ConditionalCategorical(
-		n_categories=self.n_categories[i+1])
-	self.distributions.append(distribution)
if self.distributions is None:
self.distributions = [Categorical()]
self.distributions[0]._initialize(d, max(n_categories))

for i in range(self.k):
distribution = ConditionalCategorical()
distribution._initialize(d, [[n_categories[j]]*(i+2)
for j in range(d)])

self.distributions.append(distribution)

self.n_categories = n_categories
self._initialized = True
super()._initialize(d)
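A quick worked example of the shapes this builds, with invented numbers: for d=2 features with n_categories=[3, 4] and k=2, the list comprehension hands each `ConditionalCategorical` one shape per feature, while the initial `Categorical` is sized by `max(n_categories)`:

```python
d, k = 2, 2
n_categories = [3, 4]

for i in range(k):
    print([[n_categories[j]] * (i + 2) for j in range(d)])
# i=0: [[3, 3], [4, 4]]        -> condition on one previous timestep
# i=1: [[3, 3, 3], [4, 4, 4]]  -> condition on two previous timesteps
```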

@@ -246,15 +252,24 @@ def summarize(self, X, sample_weight=None):
if self.frozen:
return

-if not self._initialized:
-	self._initialize(len(X[0]))

X = _check_parameter(_cast_as_tensor(X), "X", ndim=3,
check_parameter=self.check_data)
sample_weight = _check_parameter(_cast_as_tensor(sample_weight),
"sample_weight", min_value=0, ndim=(1, 2),
check_parameter=self.check_data)

if not self._initialized:
if self.n_categories is not None:
n_keys = self.n_categories
elif isinstance(X, torch.masked.MaskedTensor):
n_keys = (torch.max(torch.max(X._masked_data, dim=0)[0],
dim=0)[0] + 1).type(torch.int32)
else:
n_keys = (torch.max(torch.max(X, dim=0)[0], dim=0)[0] + 1).type(
torch.int32)

self._initialize(len(X[0][0]), n_keys)

if sample_weight is None:
sample_weight = torch.ones_like(X[:, 0])
elif len(sample_weight.shape) == 1:
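The net effect of the changes above: a `MarkovChain` can now be fit without passing `n_categories` up front, because per-feature key counts are inferred from the data in `summarize`. A minimal end-to-end sketch mirroring the new test below (random data; the printed values depend on the seed):

```python
import numpy
from pomegranate.markov_chain import MarkovChain

numpy.random.seed(137)
X = numpy.random.randint(0, 10, (1, 10, 1))  # (n_sequences, length, d)

model = MarkovChain(k=1)  # condition on the one previous character
model.fit(X)

print(model.distributions[0].probs)     # P(x_0): a Categorical
print(model.distributions[1].probs[0])  # P(x_t | x_{t-1}) for feature 0
print(model.log_probability(X))
```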
2 changes: 1 addition & 1 deletion setup.py
@@ -2,7 +2,7 @@

setup(
name='pomegranate',
-version='1.0.3',
+version='1.0.4',
author='Jacob Schreiber',
author_email='jmschreiber91@gmail.com',
packages=['pomegranate', 'pomegranate.distributions', 'pomegranate.hmm'],
33 changes: 32 additions & 1 deletion tests/test_markov_chain.py
@@ -94,7 +94,7 @@ def test_initialize(X):
assert model.k == 1
assert model._initialized == False

-model._initialize(2)
+model._initialize(2, ((3,)))
assert model._initialized == True
assert model.d == 2
assert model.k == 1
@@ -827,6 +827,37 @@ def test_fit(model, X):
[ -inf, -0.6931, -0.6931]]], 4)


def test_fit_k(model, X):
numpy.random.seed(137)
seq_data = numpy.random.randint(0, 10, (1,10,1))


model = MarkovChain(k=1)
model.fit(seq_data)

assert_array_almost_equal(model.distributions[1].probs[0],
[[0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
1.0000],
[0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
0.1000],
[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000],
[0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
0.1000],
[0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
0.1000],
[0.0000, 0.0000, 0.0000, 0.3333, 0.0000, 0.0000, 0.3333, 0.3333, 0.0000,
0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
1.0000],
[0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
0.1000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5000, 0.5000, 0.0000,
0.0000]], 4)


def test_fit_weighted(model, X, w):
model.fit(X, sample_weight=w)

