# bayesian_network.py
# Author: Jacob Schreiber <jmschreiber91@gmail.com>
import time
import numpy
import torch
import itertools
import networkx as nx
from ._utils import _cast_as_tensor
from ._utils import _cast_as_parameter
from ._utils import _update_parameter
from ._utils import _check_parameter
from ._utils import _reshape_weights
from .distributions._distribution import Distribution
from .distributions import Categorical
from .distributions import JointCategorical
from .distributions import ConditionalCategorical
from .factor_graph import FactorGraph
class BayesianNetwork(Distribution):
"""A Bayesian network object.
A Bayesian network is a probability distribution where dependencies between
variables are explicitly encoded in a graph structure and the lack of an
edge represents a conditional independence. These graphs are directed and
typically must be acyclic, but this implementation allows for the networks
to be cyclic as long as there is no assumption of convergence during
inference.
	Inference is done using loopy belief propagation along a factor graph
	representation. This is sometimes called the `sum-product` algorithm.
	It will yield exact results if the graph has a tree-like structure.
	Otherwise, if the graph is acyclic, it is guaranteed to converge, but
	not necessarily to optimal results. If the graph is cyclic, there is
	no guarantee of convergence, but the longer the cycles are, the more
	likely it is that good results will be obtained.
Structure learning can be done using a variety of methods.
Parameters
----------
distributions: tuple or list or None
		A set of distribution objects. These do not need to be initialized,
		i.e., they can be `Categorical()`. Currently, they must be either
		Categorical or ConditionalCategorical distributions. If provided,
		they must be consistent with the provided edges, in that every
		conditional distribution must have at least one parent in the
		provided structure. Default is None.
edges: tuple or list or None, optional
A list or tuple of 2-tuples where the first element in the 2-tuple is
the parent distribution object and the second element is the child
distribution object. If None, then no edges. Default is None.
structure: tuple or list or None, optional
A list or tuple of the parents for each distribution with a tuple
containing no elements indicating a root node. For instance,
((), (0,), (), (0, 2)) would represent a graph with four nodes,
where the second distribution has the first distribution as a parent
and the fourth distribution has the first and third distributions as
parents. Use this only when you want new distribution objects to be
created and fit when using the `fit` method. Default is None.
max_iter: int, optional
		The number of iterations to do in the inference step. Default is 20.
tol: float, optional
		The threshold of improvement below which fitting stops early.
		Default is 1e-6.
inertia: float, [0, 1], optional
Indicates the proportion of the update to apply to the parameters
during fitting. When the inertia is 0.0, the update is applied in
its entirety and the previous parameters are ignored. When the
inertia is 1.0, the update is entirely ignored and the previous
		parameters are kept, which is equivalent to the parameters being
		frozen. Default is 0.0.
frozen: bool, optional
Whether all the parameters associated with this distribution are frozen.
		If you want to freeze individual parameters, or individual values in those
parameters, you must modify the `frozen` attribute of the tensor or
parameter directly. Default is False.
check_data: bool, optional
Whether to check properties of the data and potentially recast it to
torch.tensors. This does not prevent checking of parameters but can
slightly speed up computation when you know that your inputs are valid.
Setting this to False is also necessary for compiling. Default is True.
verbose: bool, optional
Whether to print the improvement and timings during training.
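	Examples
	--------
	A minimal sketch of building and querying a small network. The
	probability values below are purely illustrative, and the nested-list
	arguments assume the tensor-casting behavior of the Categorical and
	ConditionalCategorical constructors.

	>>> import torch
	>>> from pomegranate.distributions import Categorical
	>>> from pomegranate.distributions import ConditionalCategorical
	>>> rain = Categorical([[0.8, 0.2]])
	>>> sprinkler = ConditionalCategorical([[[0.4, 0.6], [0.9, 0.1]]])
	>>> model = BayesianNetwork([rain, sprinkler], [(rain, sprinkler)])
	>>> logp = model.log_probability(torch.tensor([[0, 1], [1, 0]]))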
"""
def __init__(self, distributions=None, edges=None, structure=None,
algorithm=None, include_parents=None, exclude_parents=None,
max_parents=None, pseudocount=0.0, max_iter=20, tol=1e-6, inertia=0.0,
frozen=False, check_data=True, verbose=False):
super().__init__(inertia=inertia, frozen=frozen, check_data=check_data)
self.name = "BayesianNetwork"
self.distributions = torch.nn.ModuleList([])
self.edges = []
self.structure = structure
self._marginal_mapping = {}
self._factor_mapping = {}
self._distribution_mapping = {}
self._parents = []
self._factor_graph = FactorGraph()
self.algorithm = algorithm
self.include_parents = include_parents
self.exclude_parents = exclude_parents
self.max_parents = max_parents
self.pseudocount = pseudocount
self.max_iter = max_iter
self.tol = tol
self.verbose = verbose
self.d = 0
self._initialized = (distributions is not None and
distributions[0]._initialized)
self._reset_cache()
if distributions is not None:
_check_parameter(distributions, "factors", dtypes=(list, tuple))
for distribution in distributions:
self.add_distribution(distribution)
		if edges is not None:
			_check_parameter(edges, "edges", dtypes=(list, tuple))
			for parent, child in edges:
				self.add_edge(parent, child)
def _initialize(self, d):
self._initialized = True
super()._initialize(d)
def _reset_cache(self):
return
def add_distribution(self, distribution):
"""Adds a distribution to the set of distributions.
Adds a distribution to the set of distributions being stored in the
BayesianNetwork object but also updates the underlying factor graph,
adding in a marginal node and a factor node.
Parameters
----------
distribution: pomegranate.distributions.Distribution
A distribution object to include as a node. Currently must be a
Categorical or a ConditionalCategorical distribution.
"""
if not isinstance(distribution, (Categorical, ConditionalCategorical)):
raise ValueError("Must be Categorical or ConditionalCategorical")
self.distributions.append(distribution)
self.d += 1
# Create the marginal distribution in the factor graph
n_keys = distribution.probs[0].shape[-1]
marginal = Categorical(torch.ones(1, n_keys) / n_keys)
# Add the marginal and keep track of it
self._factor_graph.add_marginal(marginal)
self._marginal_mapping[distribution] = marginal
self._parents.append(tuple())
self._distribution_mapping[distribution] = len(self.distributions) - 1
# If a conditional distribution, calculate the joint distribution
# assuming uniform distributions for all the parents
if isinstance(distribution, ConditionalCategorical):
p = torch.clone(distribution.probs[0])
p /= torch.prod(torch.tensor(p.shape[:-1]))
factor = JointCategorical(p)
# Add the factor to the factor graph
self._factor_graph.add_factor(factor)
self._factor_mapping[distribution] = factor
else:
# Otherwise, we add the univariate distribution as the factor
self._factor_graph.add_factor(distribution)
self._factor_mapping[distribution] = distribution
factor = distribution
self._factor_graph.add_edge(marginal, factor)
def add_distributions(self, distributions):
"""Adds several distributions to the set of distributions.
Parameters
----------
		distributions: iterable
Any object that can be iterated over that returns distributions.
Must be Categorical or ConditionalCategorical distributions.
"""
for distribution in distributions:
self.add_distribution(distribution)
def add_edge(self, parent, child):
"""Adds a directed edge from the parent to the child node.
Adds an edge to the list of edges associated with the BayesianNetwork
object but also adds an appropriate edge in the underlying factor
graph between the marginal node associated with the parent and
the factor node associated with the child.
Parameters
----------
parent: pomegranate.distributions.Distribution
The distribution that the edge begins at.
child: pomegranate.distributions.Distribution
The distribution that the edge points to.
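		Examples
		--------
		A sketch of incremental construction, reusing the `rain` and
		`sprinkler` distributions from the class-level example:

		>>> model = BayesianNetwork()
		>>> model.add_distributions([rain, sprinkler])
		>>> model.add_edge(rain, sprinkler)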
"""
if not isinstance(child, ConditionalCategorical):
raise ValueError("Child distribution must be conditional.")
if parent not in self._marginal_mapping:
raise ValueError("Parent distribution must be in network.")
if child not in self._marginal_mapping:
raise ValueError("Child distribution must be in network.")
if parent is child:
raise ValueError("Cannot have self-loops.")
self.edges.append((parent, child))
p_idx = self._distribution_mapping[parent]
c_idx = self._distribution_mapping[child]
self._parents[c_idx] += (p_idx,)
# Get the respective marginal and factor distributions and their idxs
marginal = self._marginal_mapping[parent]
factor = self._factor_mapping[child]
m_idx = self._factor_graph._marginal_idxs[marginal]
f_idx = self._factor_graph._factor_idxs[factor]
# We need to keep this edge as the last one so pop it and re-add it
m = self._factor_graph._factor_edges[f_idx].pop()
f = self._factor_graph._marginal_edges[m_idx].pop()
# Add the new edge and then the previous edge
self._factor_graph.add_edge(marginal, factor)
self._factor_graph._factor_edges[f_idx].append(m)
self._factor_graph._marginal_edges[m_idx].append(f)
def add_edges(self, edges):
"""Adds several edges to the network at once.
Parameters
----------
edges: iterable
Any object that can be iterated over that returns tuples with
a pair of distributions.
"""
for edge in edges:
self.add_edge(*edge)
def sample(self, n):
"""Sample from the probability distribution.
		This method will return `n` samples generated from the underlying
		probability distribution. For a Bayesian network, variables are
		sampled in a parent-first order: each variable whose parents have
		all been assigned is drawn conditioned on those assignments, and
		the sweep repeats until every variable has a value.
Parameters
----------
n: int
The number of samples to generate.
Returns
-------
X: torch.tensor, shape=(n, self.d)
Randomly generated samples.
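		Examples
		--------
		A brief sketch, assuming the two-variable model from the
		class-level example:

		>>> X = model.sample(100)
		>>> X.shape
		torch.Size([100, 2])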
"""
X = torch.zeros(n, self.d, dtype=torch.int32) - 1
for i in range(self.d):
for j, parents in enumerate(self._parents):
if (X[0, j] != -1).item():
continue
if len(parents) == 0:
X[:, j] = self.distributions[j].sample(n)[:, 0]
else:
X_ = X[:, parents].unsqueeze(-1)
if (X_ == -1).any().item():
continue
X[:, j] = self.distributions[j].sample(n, X_)[:, 0]
return X
def log_probability(self, X):
"""Calculate the log probability of each example.
This method calculates the log probability of each example given the
parameters of the distribution. The examples must be given in a 2D
format.
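		Because the network factorizes the joint distribution, the score of
		an example is computed as

			log P(x) = sum_i log P(x_i | x_parents(i))

		i.e., the sum over variables of the log probability of each value
		under that variable's (conditional) distribution.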
Parameters
----------
X: list, tuple, numpy.ndarray, torch.Tensor, shape=(-1, self.d)
A set of examples to evaluate.
Returns
-------
logp: torch.Tensor, shape=(-1,)
The log probability of each example.
"""
X = _check_parameter(_cast_as_tensor(X), "X", ndim=2,
check_parameter=self.check_data)
logps = torch.zeros(X.shape[0], device=X.device, dtype=torch.float32)
for i, distribution in enumerate(self.distributions):
parents = self._parents[i] + (i,)
X_ = X[:, parents]
if len(parents) > 1:
X_ = X_.unsqueeze(-1)
logps += distribution.log_probability(X_)
return logps
def predict(self, X):
"""Infers the maximum likelihood value for each missing value.
This method infers a probability distribution for each of the missing
values in the data. It uses the factor graph representation of the
Bayesian network to run the sum-product/loopy belief propagation
algorithm. After the probability distribution is inferred, the maximum
		likelihood value for each variable is returned.
The input to this method must be a torch.masked.MaskedTensor where the
mask specifies which variables are observed (mask = True) and which ones
are not observed (mask = False) for each of the values. When setting
mask = False, it does not matter what the corresponding value in the
tensor is. Different sets of variables can be observed or missing in
different examples.
Unlike the `predict_proba` and `predict_log_proba` methods, this
method preserves the dimensions of the original data because it does
not matter how many categories a variable can take when you're only
returning the maximally likely one.
Parameters
----------
X: torch.masked.MaskedTensor, shape=(-1, d)
The data to predict values for. The mask should correspond to
whether the variable is observed in the example.
Returns
-------
y: torch.tensor, shape=(-1, d)
A completed version of the incomplete input tensor. The missing
variables are replaced with the maximally likely values from
the sum-product algorithm, and observed variables are kept.
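		Examples
		--------
		A sketch of imputing one missing value, reusing the two-variable
		model from the class-level example; only the first variable is
		observed here:

		>>> X = torch.tensor([[1, 0]])
		>>> mask = torch.tensor([[True, False]])
		>>> X_masked = torch.masked.MaskedTensor(X, mask=mask)
		>>> y = model.predict(X_masked)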
"""
y = [t.argmax(dim=1) for t in self.predict_proba(X)]
return torch.vstack(y).T.contiguous()
def predict_proba(self, X):
"""Infers the probability of each category given the model and data.
This method infers a probability distribution for each of the missing
values in the data. It uses the factor graph representation of the
Bayesian network to run the sum-product/loopy belief propagation
algorithm.
The input to this method must be a torch.masked.MaskedTensor where the
mask specifies which variables are observed (mask = True) and which ones
are not observed (mask = False) for each of the values. When setting
mask = False, it does not matter what the corresponding value in the
tensor is. Different sets of variables can be observed or missing in
different examples.
An important note is that, because each variable can have a different
number of categories in the categorical setting, the return is a list
of tensors where each element in that list is the marginal probability
distribution for that variable. More concretely: the first element will
be the distribution of values for the first variable across all
examples. When the first variable has been provided as evidence, the
distribution will be clamped to the value provided as evidence.
		.. warning:: This inference is exact given a Bayesian network that has
a tree-like structure, but is only approximate for other cases. When
the network is acyclic, this procedure will converge, but if the graph
contains cycles then there is no guarantee on convergence.
Parameters
----------
X: torch.masked.MaskedTensor, shape=(-1, d)
The data to predict values for. The mask should correspond to
whether the variable is observed in the example.
Returns
-------
y: list of tensors, shape=(d,)
A list of tensors where each tensor contains the distribution of
values for that dimension.
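		Examples
		--------
		Continuing the sketch from the `predict` docstring, the return value
		is a list with one tensor of per-category probabilities per variable:

		>>> y = model.predict_proba(X_masked)
		>>> len(y)
		2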
"""
return self._factor_graph.predict_proba(X)
def predict_log_proba(self, X):
"""Infers the probability of each category given the model and data.
This method is a wrapper around the `predict_proba` method and simply
takes the log of each returned tensor.
This method infers a log probability distribution for each of the
missing values in the data. It uses the factor graph representation of
the Bayesian network to run the sum-product/loopy belief propagation
algorithm.
The input to this method must be a torch.masked.MaskedTensor where the
mask specifies which variables are observed (mask = True) and which ones
are not observed (mask = False) for each of the values. When setting
mask = False, it does not matter what the corresponding value in the
tensor is. Different sets of variables can be observed or missing in
different examples.
An important note is that, because each variable can have a different
number of categories in the categorical setting, the return is a list
of tensors where each element in that list is the marginal probability
distribution for that variable. More concretely: the first element will
be the distribution of values for the first variable across all
examples. When the first variable has been provided as evidence, the
distribution will be clamped to the value provided as evidence.
		.. warning:: This inference is exact given a Bayesian network that has
a tree-like structure, but is only approximate for other cases. When
the network is acyclic, this procedure will converge, but if the graph
contains cycles then there is no guarantee on convergence.
Parameters
----------
X: torch.masked.MaskedTensor, shape=(-1, d)
The data to predict values for. The mask should correspond to
whether the variable is observed in the example.
Returns
-------
y: list of tensors, shape=(d,)
A list of tensors where each tensor contains the distribution of
values for that dimension.
"""
return [torch.log(t) for t in self.predict_proba(X)]
def fit(self, X, sample_weight=None):
"""Fit the model to optionally weighted examples.
This method will fit the provided distributions given the data and
their weights. If a structure is provided as a set of edges, then
this will use maximum likelihood estimates to fit each of those
distributions. If no structure is provided, this will use the
structure learning algorithm provided to jointly learn the structure
and the parameters.
Parameters
----------
X: list, tuple, numpy.ndarray, torch.Tensor, shape=(-1, self.d)
A set of examples to evaluate.
sample_weight: list, tuple, numpy.ndarray, torch.Tensor, optional
A set of weights for the examples. This can be either of shape
(-1, self.d) or a vector of shape (-1,). Default is ones.
Returns
-------
self
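		Examples
		--------
		A sketch of jointly learning structure and parameters with the
		Chow-Liu algorithm; the data here is arbitrary:

		>>> X = torch.randint(2, (100, 3))
		>>> model = BayesianNetwork(algorithm='chow-liu')
		>>> model = model.fit(X)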
"""
if self.algorithm is not None:
self.structure = _learn_structure(X, sample_weight=sample_weight,
algorithm=self.algorithm,
include_parents=self.include_parents,
exclude_parents=self.exclude_parents,
max_parents=self.max_parents, pseudocount=self.pseudocount)
if self.structure is not None:
distributions = _from_structure(X, sample_weight, self.structure)
self.add_distributions(distributions)
for i, parents in enumerate(self.structure):
if len(parents) > 0:
for parent in parents:
self.add_edge(distributions[parent], distributions[i])
self.summarize(X, sample_weight=sample_weight)
self.from_summaries()
return self
def summarize(self, X, sample_weight=None):
"""Extract the sufficient statistics from a batch of data.
This method calculates the sufficient statistics from optionally
weighted data and adds them to the stored cache for each distribution
in the network. Sample weights can either be provided as one
value per example or as a 2D matrix of weights for each feature in
each example.
Parameters
----------
		X: list, tuple, numpy.ndarray, torch.Tensor, shape=(-1, self.d)
A set of examples to summarize.
sample_weight: list, tuple, numpy.ndarray, torch.Tensor, optional
A set of weights for the examples. This can be either of shape
(-1, self.d) or a vector of shape (-1,). Default is ones.
"""
if self.frozen:
return
if not self._initialized:
self._initialize(len(X[0]))
X, sample_weight = super().summarize(X, sample_weight=sample_weight)
X = _check_parameter(X, "X", min_value=0, ndim=2,
check_parameter=self.check_data)
for i, distribution in enumerate(self.distributions):
parents = self._parents[i] + (i,)
w = sample_weight[:, i]
if len(parents) == 1:
distribution.summarize(X[:, parents], sample_weight=w)
else:
distribution.summarize(X[:, parents].unsqueeze(-1),
sample_weight=w)
def from_summaries(self):
"""Update the model parameters given the extracted statistics.
This method uses calculated statistics from calls to the `summarize`
method to update the distribution parameters. Hyperparameters for the
update are passed in at initialization time.
		Note: Internally, a call to `fit` is just a call to the `summarize`
		method followed by a call to the `from_summaries` method.
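		Examples
		--------
		A sketch of out-of-core fitting, assuming a model whose
		distributions and edges have already been added, with data `X` as
		in the class-level example:

		>>> model.summarize(X[:50])
		>>> model.summarize(X[50:])
		>>> model.from_summaries()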
"""
if self.frozen:
return
for distribution in self.distributions:
distribution.from_summaries()
if isinstance(distribution, ConditionalCategorical):
p = torch.clone(distribution.probs[0])
p /= torch.prod(torch.tensor(p.shape[:-1]))
self._factor_mapping[distribution].probs = _cast_as_parameter(p)
#####
def _from_structure(X, sample_weight=None, structure=None, pseudocount=0.0):
"""Fits a set of distributions to data given the structure.
Given the structure, create the distribution objects and fit their
parameters to the given data. This does not perform structure learning.
Parameters
----------
X: list, tuple, numpy.ndarray, torch.Tensor, shape=(-1, self.d)
A set of examples to evaluate.
sample_weight: list, tuple, numpy.ndarray, torch.Tensor, or None
A set of weights for the examples. This can be either of shape
(-1, self.d) or a vector of shape (-1,). Default is ones.
structure: tuple
A tuple of tuples where the internal tuples indicate the parents of
that variable.
pseudocount: double
A pseudocount to add to each count. Default is 0.
Returns
-------
	distributions: list
		A list of fit distribution objects, one per variable, ordered to
		match the columns of `X`.
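	Examples
	--------
	A sketch with a two-variable chain structure over arbitrary data:

	>>> X = torch.randint(2, (100, 2))
	>>> distributions = _from_structure(X, structure=((), (0,)))
	>>> len(distributions)
	2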
"""
X = _check_parameter(_cast_as_tensor(X), "X", min_value=0, ndim=2,
dtypes=(torch.int32, torch.int64))
	sample_weight = _check_parameter(_cast_as_tensor(sample_weight),
		"sample_weight", min_value=0, ndim=1)
n, d = X.shape
if sample_weight is None:
sample_weight = torch.ones(n, dtype=torch.float32, device=X.device)
if structure is None:
structure = tuple([tuple() for i in range(d)])
	# Create one distribution per variable, fit to that variable's column
	# (and, for conditional distributions, its parents' columns).
	distributions = []
	for i, parents in enumerate(structure):
		if len(parents) == 0:
			distribution = Categorical()
			if distribution.device != X.device:
				distribution.to(X.device)
			distribution.fit(X[:, i:i+1], sample_weight=sample_weight)
		else:
			parents = parents + (i,)
			distribution = ConditionalCategorical()
			if distribution.device != X.device:
				distribution.to(X.device)
			distribution.fit(X[:, parents].unsqueeze(-1),
				sample_weight=sample_weight)
		distributions.append(distribution)
	return distributions
def _learn_structure(X, sample_weight=None, algorithm='chow-liu',
include_parents=None, exclude_parents=None, max_parents=None,
pseudocount=0, penalty=None, root=0):
"""Learn the structure of a Bayesian network using data.
This function will take in data, an algorithm, and parameters for that
algorithm, and will return the learned structure. Currently supported
algorithms are:
- 'chow-liu': Learn a maximal spanning tree that is the most likely
tree given the data and weights.
	- 'exact': A dynamic programming solution to the exact BNSL task that
	reduces the time required from super-exponential to exponential.
Parameters
----------
X: torch.tensor, shape=(n, d)
		The data to fit the structure to, where each row is a sample and
each column corresponds to the associated variable.
sample_weight: torch.tensor, shape=(n,)
The weight of each sample as a positive double. If None, all items are
weighted equally.
algorithm: str
The name of the algorithm to use for structure learning. Must be one
of 'chow-liu' or 'exact'.
include_parents: list or None
A list of tuples where each inner tuple is made up of integers
indicating the parents that must exist for that variable. For example,
when passing in [(1,), (), ()], the first variable must have the second
variable as a parent, and potentially others if learned, and the others
do not have any parents that must be present. If None, no parents are
forced. Default is None.
exclude_parents: list or None
A list of tuples where each inner tuple is made up of integers
indicating the parents that cannot exist for that variable. For example,
		when passing in [(1,), (), ()], the first variable cannot have the
		second variable as a parent, and the other variables have no
		restrictions. If None, no parents are excluded. Default is None.
	max_parents: int or None
		The maximum number of parents a variable can have, corresponding
		to the k-learn procedure. Setting this can drastically speed up
		the algorithms. If None, there is no limit on the number of
		parents. Default is None.
pseudocount: double
A pseudocount to add to each count. Default is 0.
penalty: float or None, optional
The weighting of the model complexity term in the objective function.
Increasing this value will encourage sparsity whereas setting the value
to 0 will result in an unregularized structure. Default is
log2(|D|) / 2 where |D| is the sum of the weights of the data.
root: int
When using a tree-based algorithm, like Chow-Liu, sets the variable
that is the root of the tree.
Returns
-------
structure: tuple, shape=(d,)
A tuple of tuples, where each inner tuple represents a dimension in
the data and contains the parents of that dimension.
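	Examples
	--------
	A sketch of learning a tree structure over three arbitrary variables:

	>>> X = torch.randint(2, (100, 3))
	>>> structure = _learn_structure(X, algorithm='chow-liu')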
"""
X = _check_parameter(_cast_as_tensor(X), "X", min_value=0, ndim=2,
dtypes=(torch.int32, torch.int64))
	sample_weight = _check_parameter(_cast_as_tensor(sample_weight),
		"sample_weight", min_value=0, ndim=1)
if sample_weight is None:
sample_weight = torch.ones(X.shape[0], dtype=torch.float32,
device=X.device)
	if algorithm == 'chow-liu':
		structure = _categorical_chow_liu(X, sample_weight=sample_weight,
			pseudocount=pseudocount, root=root)
	elif algorithm == 'exact':
		structure = _categorical_exact(X, sample_weight=sample_weight,
			include_parents=include_parents, exclude_parents=exclude_parents,
			max_parents=max_parents, pseudocount=pseudocount)
	else:
		raise ValueError("Must pass a valid structure learning algorithm: "
			"'chow-liu' or 'exact'. Got {}.".format(algorithm))
	return structure
def _categorical_chow_liu(X, sample_weight=None, pseudocount=0.0, root=0):
"""An internal function for calculating a Chow-Liu tree.
This function calculates a Chow-Liu tree on categorical data which is
potentially weighted. A Chow-Liu tree is essentially a maximum spanning
tree on the information content that adding each new variable might
add.
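	Concretely, an (optionally weighted and pseudocount-smoothed) pairwise
	mutual information score

		I(X_i; X_j) = sum_{a,b} P(x_i=a, x_j=b)
			* log(P(x_i=a, x_j=b) / (P(x_i=a) * P(x_j=b)))

	is computed between every pair of variables, and a maximum spanning
	tree over these scores, rooted at `root`, is returned.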
Parameters
----------
X: torch.tensor, shape=(n, d)
		The data to fit the structure to, where each row is a sample and
each column corresponds to the associated variable.
sample_weight: torch.tensor or None, shape=(n,)
The weight of each sample as a positive double. If None, all items are
weighted equally.
pseudocount: double
A pseudocount to add to each count. Default is 0.
root: int
When using a tree-based algorithm, like Chow-Liu, sets the variable
that is the root of the tree.
Returns
-------
structure: tuple
A tuple of tuples where the internal tuples indicate the parents of
that variable.
"""
X = _check_parameter(_cast_as_tensor(X), "X", min_value=0, ndim=2,
dtypes=(torch.int32, torch.int64))
	sample_weight = _check_parameter(_cast_as_tensor(sample_weight),
		"sample_weight", min_value=0, ndim=1)
pseudocount = _check_parameter(pseudocount, "pseudocount", min_value=0,
ndim=0)
root = _check_parameter(root, "root", min_value=0, max_value=X.shape[1],
dtypes=(int, numpy.int32, numpy.int64, torch.int32, torch.int64))
n, d = X.shape
if sample_weight is None:
sample_weight = torch.ones(n)
n_categories = tuple(x.item() for x in X.max(dim=0).values + 1)
max_categories = max(n_categories)
c = max_categories ** 2
dtype = sample_weight.dtype
count = torch.empty(c, d, dtype=dtype, device=X.device)
mutual_info = torch.zeros(d, d, dtype=dtype, device=X.device)
marginals_i = torch.empty(d, c, dtype=dtype, device=X.device)
marginals_r = torch.empty(d, c, dtype=dtype, device=X.device)
for j in range(d):
marg = torch.zeros(max_categories, dtype=dtype, device=X.device)
marg.scatter_add_(0, X[:, j], sample_weight)
marginals_i[j] = marg.repeat_interleave(max_categories)
marginals_r[j] = marg.repeat(max_categories)
w = sample_weight.unsqueeze(-1).expand(-1, d)
for j in range(d):
X_j = X[:, j:j+1] * max_categories + X
m = marginals_i[j:j+1] * marginals_r
count[:] = pseudocount
count.scatter_add_(0, X_j, w)
mutual_info[j] -= torch.sum(count * torch.log(count / m.T), dim=0)
mutual_info[:, j] = mutual_info[j]
structure = [[] for i in range(d)]
visited = [root]
unvisited = list(range(d))
unvisited.remove(root)
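	# Prim-style tree growth: `mutual_info` holds *negated* (unnormalized)
	# mutual information scores, so repeatedly taking the argmin over edges
	# between visited and unvisited nodes adds the edge with the highest
	# mutual information, yielding a maximum spanning tree.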
	for i in range(d-1):
		idx = mutual_info[visited][:, unvisited].argmin()
		row, col = (idx // len(unvisited)).item(), (idx % len(unvisited)).item()
		min_x, min_y = visited[row], unvisited[col]
structure[min_y].append(min_x)
visited.append(min_y)
unvisited.remove(min_y)
return tuple(tuple(x) for x in structure)
def _categorical_exact(X, sample_weight=None, include_parents=None,
exclude_parents=None, pseudocount=0, penalty=None, max_parents=None):
"""Find the optimal graph over a set of variables with no other knowledge.
This is the naive dynamic programming structure learning task where the
optimal graph is identified from a set of variables using an order graph
and parent graphs. This can be used either when no constraint graph is
	provided or for an SCC that is made up of a node containing a self-loop.
This is a reference implementation that uses the naive shortest path
algorithm over the entire order graph. The 'exact' option uses the A* path
in order to avoid considering the full order graph.
Parameters
----------
X: numpy.ndarray, shape=(n, d)
		The data to fit the structure to, where each row is a sample and
each column corresponds to the associated variable.
sample_weight: numpy.ndarray, shape=(n,)
The weight of each sample as a positive double. Default is None.
include_parents: list or None
A set of (parent, child) tuples where each tuple is an edge that
must exist in the found structure.
exclude_parents: list or None
A set of (parent, child) tuples where each tuple is an edge that
cannot exist in the found structure.
penalty: float or None, optional
The weighting of the model complexity term in the objective function.
Increasing this value will encourage sparsity whereas setting the value
to 0 will result in an unregularized structure. Default is
log2(|D|) / 2 where |D| is the sum of the weights of the data.
	max_parents: int or None
		The maximum number of parents a node can have, corresponding to
		the k-learn procedure. Setting this can drastically speed up the
		algorithm. If None, there is no limit. Default is None.
Returns
-------
structure: tuple, shape=(d,)
The parents for each variable in this SCC
"""
X = _check_parameter(_cast_as_tensor(X), "X", min_value=0, ndim=2,
dtypes=(torch.int32, torch.int64))
	sample_weight = _check_parameter(_cast_as_tensor(sample_weight),
		"sample_weight", min_value=0, ndim=1)
n, d = X.shape
n_categories = X.max(axis=0).values + 1
if sample_weight is None:
sample_weight = torch.ones(n)
w = sample_weight.sum()
if max_parents is None:
max_parents = int(torch.log2(2 * w / torch.log2(w)).item())
parent_graphs = []
for i in range(d):
exclude = None if exclude_parents is None else exclude_parents[i]
parent_set = tuple(set(range(d)) - set([i]))
parent_graph = _generate_parent_graph(X, sample_weight=sample_weight,
n_categories=n_categories, column_idx=i,
include_parents=include_parents, exclude_parents=exclude,
pseudocount=pseudocount, penalty=penalty, max_parents=max_parents,
parent_set=parent_set)
parent_graphs.append(parent_graph)
order_graph = nx.DiGraph()
for i in range(d+1):
for subset in itertools.combinations(range(d), i):
order_graph.add_node(subset)
for variable in subset:
parent = tuple(v for v in subset if v != variable)
structure, weight = parent_graphs[variable][parent]
weight = -weight if weight < 0 else 0
order_graph.add_edge(parent, subset, weight=weight,
structure=structure)
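	# Each edge weight above is the negated (clipped to non-negative) score
	# of the best parent set for the variable being added, so the shortest
	# path from the empty set to the full variable set selects an optimal
	# ordering and, along it, an optimal parent set for each variable.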
path = sorted(nx.all_shortest_paths(order_graph, source=(),
target=tuple(range(d)), weight="weight"))[0]
	score, structure = 0, [None for i in range(d)]
for u, v in zip(path[:-1], path[1:]):
idx = list(set(v) - set(u))[0]
parents = order_graph.get_edge_data(u, v)['structure']
structure[idx] = parents
score -= order_graph.get_edge_data(u, v)['weight']
return tuple(structure)
def _generate_parent_graph(X, sample_weight, n_categories, column_idx,
include_parents=None, exclude_parents=None, pseudocount=0, penalty=None,
max_parents=None, parent_set=None):
"""Generate a parent graph for a single variable over its parents.
	This will generate the parent graph for a single variable given the data.
	A parent graph is the dynamically generated best parent set and respective
	score for each combination of parent variables. For example, if we are
	generating a parent graph for x1 over x2, x3, and x4, we may calculate
	that having only x2 as a parent is better than having both x2 and x3,
	and so store the parent set (x2,) in the node for (x2, x3).
Parameters
----------
X: list, numpy.ndarray, torch.tensor, shape=(n, d)
		The data to fit the structure to, where each row is a sample and
each column corresponds to the associated variable.
	sample_weight: list, numpy.ndarray, torch.tensor, shape=(n,)
The weight of each sample as a positive double. Default is None.
n_categories: list, numpy.ndarray, torch.tensor, shape=(d,)
The number of unique keys in each column.
column_idx: int
The column index to build the parent graph for.
include_parents: list or tuple or None
A set of integers indicating the parents that must be included in the
returned structure. Default is None.