Commit v1.0.4

jmschrei committed Mar 11, 2024
1 parent a446005 commit 2437323
Showing 7 changed files with 172 additions and 21 deletions.
10 changes: 10 additions & 0 deletions docs/whats_new.rst
@@ -5,6 +5,16 @@
Release History
===============

Version 1.0.4
==============

Highlights
----------

- Fixed an issue with Markov Chains and ConditionalCategorical distributions
- Added more documentation to ConditionalCategorical distributions


Version 1.0.3
==============

2 changes: 1 addition & 1 deletion pomegranate/__init__.py
@@ -1 +1 @@
__version__ = "1.0.3"
__version__ = "1.0.4"
11 changes: 11 additions & 0 deletions pomegranate/distributions/categorical.py
@@ -38,6 +38,17 @@ class Categorical(Distribution):
Probabilities for each key for each feature, where k is the largest
number of keys across all features. Default is None
n_categories: list, numpy.ndarray, torch.tensor or None, optional
The number of categories for each feature in the data. Only needs to
be provided when the parameters will be learned directly from data and
you want to make sure that the right number of keys is included in each
dimension. Default is None.
pseudocount: float, optional
A value to add to the observed counts of each feature when training.
Setting this to a positive value ensures that no probabilities are
truly zero. Default is 0.
inertia: float, (0, 1), optional
Indicates the proportion of the update to apply to the parameters
during training. When the inertia is 0.0, the update is applied in
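Taken together, these parameters are easiest to see in code. A minimal sketch of the `Categorical` API described above (the probabilities and data are invented for illustration):

```python
import torch
from pomegranate.distributions import Categorical

# Two features; each row of `probs` is one feature, each column one key.
d = Categorical(probs=[[0.2, 0.3, 0.5],
                       [0.1, 0.1, 0.8]])

# X has shape (n_samples, n_features) and holds integer keys.
X = torch.tensor([[0, 2], [1, 2], [2, 2]])
print(d.log_probability(X))  # one log-probability per row

# When learning from data, `n_categories` pins the number of keys per
# feature even if some keys never appear, and a positive `pseudocount`
# keeps unseen keys from getting probability exactly zero.
d2 = Categorical(n_categories=[3, 3], pseudocount=1.0)
d2.fit(X)
```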
88 changes: 86 additions & 2 deletions pomegranate/distributions/conditional_categorical.py
@@ -16,9 +16,69 @@
from ._distribution import ConditionalDistribution
from .categorical import Categorical


class ConditionalCategorical(ConditionalDistribution):
"""Still under development."""

"""A conditional categorical distribution.
This is a categorical distribution that is conditioned on previous
emissions, meaning that the probability of each character depends on the
observed character earlier in the sequence. Each feature is conditioned
independently of the others like a `Categorical` distribution.
This conditioning makes the shape of the distribution a bit more
complicated than the `JointCategorical` distribution. Specifically, a
`JointCategorical` distribution is multivariate by definition but a
`ConditionalCategorical` does not have to be. Although both may appear
similar in that they both take in a vector of characters and return
probabilities, the vector fed into the JointCategorical are all observed
together without some notion of time, whereas the ConditionalCategorical
explicitly requires a notion of timing, where the probability of later
characters depend on the composition of characters seen before.
Parameters
----------
probs: list of numpy.ndarray, torch.tensor or None, shape=(k, k), optional
A list of conditional probabilities with one tensor for each feature
in the data being modeled. Each tensor should have `k+1` dimensions
where `k` is the number of timesteps to condition on. Each dimension
should span the number of keys in that dimension. For example, if
specifying a univariate conditional categorical distribution where
k=2, a valid tensor shape would be [(2, 3, 4)]. Default is None.
n_categories: list, numpy.ndarray, torch.tensor or None, optional
The number of categories for each feature in the data. Only needs to
be provided when the parameters will be learned directly from data and
you want to make sure that the right number of keys is included in each
dimension. Unlike the `Categorical` distribution, this needs to be
a list of shapes with one shape for each feature, matching the shapes
specified in `probs`. Default is None.
pseudocount: float, optional
A value to add to the observed counts of each feature when training.
Setting this to a positive value ensures that no probabilities are
truly zero. Default is 0.
inertia: float, (0, 1), optional
Indicates the proportion of the update to apply to the parameters
during training. When the inertia is 0.0, the update is applied in
its entirety and the previous parameters are ignored. When the
inertia is 1.0, the update is entirely ignored and the previous
parameters are kept, equivalently to if the parameters were frozen.
frozen: bool, optional
Whether all the parameters associated with this distribution are frozen.
If you want to freeze individual parameters, or individual values in those
parameters, you must modify the `frozen` attribute of the tensor or
parameter directly. Default is False.
check_data: bool, optional
Whether to check properties of the data and potentially recast it to
torch.tensors. This does not prevent checking of parameters but can
slightly speed up computation when you know that your inputs are valid.
Setting this to False is also necessary for compiling.
"""

def __init__(self, probs=None, n_categories=None, pseudocount=0,
inertia=0.0, frozen=False, check_data=True):
super().__init__(inertia=inertia, frozen=frozen, check_data=check_data)
@@ -47,6 +107,22 @@ def __init__(self, probs=None, n_categories=None, pseudocount=0,
self._reset_cache()

def _initialize(self, d, n_categories):
"""Initialize the probability distribution.
This method is meant to only be called internally. It initializes the
parameters of the distribution and stores its dimensionality. For more
complex methods, this function will do more.
Parameters
----------
d: int
The dimensionality the distribution is being initialized to.
n_categories: list of tuples
The shape of each conditional distribution, one per feature.
"""

self.n_categories = []
for n_cat in n_categories:
if isinstance(n_cat, (list, tuple)):
Expand All @@ -63,6 +139,14 @@ def _initialize(self, d, n_categories):
super()._initialize(d)
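As a sketch of the `n_categories` structure that `_initialize` expects (shapes invented for illustration): for two features that each condition on one previous timestep, with 2 and 4 keys respectively, the argument is a list of one shape per feature, mirroring the tensors that would appear in `probs`:

```python
n_categories = [(2, 2), (4, 4)]
```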

def _reset_cache(self):
"""Reset the internally stored statistics.
This method is meant to only be called internally. It resets the
stored statistics used to update the model parameters as well as
recalculates the cached values meant to speed up log probability
calculations.
"""

if self._initialized == False:
return

47 changes: 31 additions & 16 deletions pomegranate/markov_chain.py
@@ -40,10 +40,9 @@ class MarkovChain(Distribution):
the number of steps back to model in the sequence. This must be passed
in if the distributions are not passed in.
-n_categories: list, numpy.ndarray, torch.tensor, or None, shape=(d,)
-	A vector with the maximum number of categories that each column
-	can have. If not given, this will be inferred from the data. Default
-	is None.
+n_categories: list, tuple, or None
+	A list or tuple containing the number of categories that each feature
+	has.
inertia: float, [0, 1], optional
Indicates the proportion of the update to apply to the parameters
@@ -82,15 +81,12 @@ def __init__(self, distributions=None, k=None, n_categories=None,

if distributions is not None:
self.k = len(distributions) - 1

-if n_categories is None:
-	self.n_categories = [None for i in range(self.k+1)]

self.d = None
self._initialized = distributions is not None and distributions[0]._initialized
self._reset_cache()

-def _initialize(self, d):
+def _initialize(self, d, n_categories):
"""Initialize the probability distribution.
This method is meant to only be called internally. It initializes the
@@ -102,14 +98,24 @@ def _initialize(self, d):
----------
d: int
The dimensionality the distribution is being initialized to.
n_categories: list, numpy.ndarray, torch.tensor
	The number of categories for each feature in the data. The initial
	`Categorical` distribution is sized with the maximum of these values.
"""

-self.distributions = [Categorical(n_categories=self.n_categories[0])]
-for i in range(self.k):
-	distribution = ConditionalCategorical(
-		n_categories=self.n_categories[i+1])
-	self.distributions.append(distribution)
if self.distributions is None:
self.distributions = [Categorical()]
self.distributions[0]._initialize(d, max(n_categories))

for i in range(self.k):
distribution = ConditionalCategorical()
distribution._initialize(d, [[n_categories[j]]*(i+2)
for j in range(d)])

self.distributions.append(distribution)

self.n_categories = n_categories
self._initialized = True
super()._initialize(d)
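A quick worked example of the shapes this builds, with invented numbers: for d=2 features with n_categories=[3, 4] and k=2, the list comprehension hands each `ConditionalCategorical` one shape per feature, while the initial `Categorical` is sized by `max(n_categories)`:

```python
d, k = 2, 2
n_categories = [3, 4]

for i in range(k):
    print([[n_categories[j]] * (i + 2) for j in range(d)])
# i=0: [[3, 3], [4, 4]]        -> condition on one previous timestep
# i=1: [[3, 3, 3], [4, 4, 4]]  -> condition on two previous timesteps
```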

@@ -246,15 +252,24 @@ def summarize(self, X, sample_weight=None):
if self.frozen:
return

-if not self._initialized:
-	self._initialize(len(X[0]))

X = _check_parameter(_cast_as_tensor(X), "X", ndim=3,
check_parameter=self.check_data)
sample_weight = _check_parameter(_cast_as_tensor(sample_weight),
"sample_weight", min_value=0, ndim=(1, 2),
check_parameter=self.check_data)

if not self._initialized:
if self.n_categories is not None:
n_keys = self.n_categories
elif isinstance(X, torch.masked.MaskedTensor):
n_keys = (torch.max(torch.max(X._masked_data, dim=0)[0],
dim=0)[0] + 1).type(torch.int32)
else:
n_keys = (torch.max(torch.max(X, dim=0)[0], dim=0)[0] + 1).type(
torch.int32)

self._initialize(len(X[0][0]), n_keys)

if sample_weight is None:
sample_weight = torch.ones_like(X[:, 0])
elif len(sample_weight.shape) == 1:
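The net effect of the changes above: a `MarkovChain` can now be fit without passing `n_categories` up front, because per-feature key counts are inferred from the data in `summarize`. A minimal end-to-end sketch mirroring the new test below (random data; the printed values depend on the seed):

```python
import numpy
from pomegranate.markov_chain import MarkovChain

numpy.random.seed(137)
X = numpy.random.randint(0, 10, (1, 10, 1))  # (n_sequences, length, d)

model = MarkovChain(k=1)  # condition on the one previous character
model.fit(X)

print(model.distributions[0].probs)     # P(x_0): a Categorical
print(model.distributions[1].probs[0])  # P(x_t | x_{t-1}) for feature 0
print(model.log_probability(X))
```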
2 changes: 1 addition & 1 deletion setup.py
@@ -2,7 +2,7 @@

setup(
name='pomegranate',
-version='1.0.3',
+version='1.0.4',
author='Jacob Schreiber',
author_email='jmschreiber91@gmail.com',
packages=['pomegranate', 'pomegranate.distributions', 'pomegranate.hmm'],
33 changes: 32 additions & 1 deletion tests/test_markov_chain.py
@@ -94,7 +94,7 @@ def test_initialize(X):
assert model.k == 1
assert model._initialized == False

-model._initialize(2)
+model._initialize(2, ((3,)))
assert model._initialized == True
assert model.d == 2
assert model.k == 1
@@ -827,6 +827,37 @@ def test_fit(model, X):
[ -inf, -0.6931, -0.6931]]], 4)


def test_fit_k(model, X):
numpy.random.seed(137)
seq_data = numpy.random.randint(0, 10, (1,10,1))


model = MarkovChain(k=1)
model.fit(seq_data)

assert_array_almost_equal(model.distributions[1].probs[0],
[[0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
1.0000],
[0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
0.1000],
[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000],
[0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
0.1000],
[0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
0.1000],
[0.0000, 0.0000, 0.0000, 0.3333, 0.0000, 0.0000, 0.3333, 0.3333, 0.0000,
0.0000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
1.0000],
[0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
0.1000],
[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5000, 0.5000, 0.0000,
0.0000]], 4)


def test_fit_weighted(model, X, w):
model.fit(X, sample_weight=w)

