Commit
Merge remote-tracking branch 'origin/master'
lmcinnes committed Feb 8, 2022
2 parents c13be8a + 2179c24 commit ec3b205
Showing 13 changed files with 1,189 additions and 530 deletions.
151 changes: 151 additions & 0 deletions azure-pipelines.yml
@@ -0,0 +1,151 @@
# Python package
# Create and test a Python package on multiple Python versions.
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
# https://docs.microsoft.com/azure/devops/pipelines/languages/python

trigger:
- master

jobs:
- job: Linux
  pool:
    vmImage: ubuntu-latest
  strategy:
    matrix:
      Python37:
        python.version: '3.7'
      Python38:
        python.version: '3.8'
      Python39:
        python.version: '3.9'

  steps:
  - task: UsePythonVersion@0
    inputs:
      versionSpec: '$(python.version)'
    displayName: 'Use Python $(python.version)'

  - script: |
      python -m pip install --upgrade pip
      pip install -r requirements.txt
    displayName: 'Install dependencies'
  - script: |
      pip install cython
      python setup.py develop
  - script: |
      pip install pytest pytest-azurepipelines
      pytest
    displayName: 'pytest'
  - task: PublishTestResults@2
    inputs:
      testResultsFiles: 'pytest.xml'
      testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)'
    condition: succeededOrFailed()

- job: Windows
  pool:
    vmImage: 'windows-latest'
  strategy:
    matrix:
      Python36:
        python.version: '3.6'
      Python37:
        python.version: '3.7'
      Python38:
        python.version: '3.8'
      Python39:
        python.version: '3.9'

  steps:
  - task: UsePythonVersion@0
    inputs:
      versionSpec: '$(python.version)'
    displayName: 'Use Python $(python.version)'

  - script: |
      python -m pip install --upgrade pip
      pip install -r requirements.txt
    displayName: 'Install dependencies'
  - script: |
      pip install cython
      python setup.py develop
  - script: |
      pip install pytest pytest-azurepipelines
      pytest
    displayName: 'pytest'
- job: MacOS
  pool:
    vmImage: 'macos-latest'
  strategy:
    matrix:
      Python37:
        python.version: '3.7'
      Python38:
        python.version: '3.8'
      Python39:
        python.version: '3.9'

  steps:
  - task: UsePythonVersion@0
    inputs:
      versionSpec: '$(python.version)'
    displayName: 'Use Python $(python.version)'

  - script: |
      python -m pip install --upgrade pip
      pip install -r requirements.txt
    displayName: 'Install dependencies'
  - script: |
      pip install cython
      python setup.py develop
  - script: |
      pip install pytest pytest-azurepipelines
      pytest
    displayName: 'pytest'
- job: Coverage
  pool:
    vmImage: ubuntu-latest
  strategy:
    matrix:
      Python39:
        python.version: '3.9'

  steps:
  - task: UsePythonVersion@0
    inputs:
      versionSpec: '$(python.version)'
    displayName: 'Use Python $(python.version)'

  - script: |
      python -m pip install --upgrade pip
      pip install -r requirements.txt
    displayName: 'Install dependencies'
  - script: |
      pip install cython
      pip install pytest
      pip install pytest-cov
      pip install coveralls
      pip install codecov
      python setup.py develop
  - script: |
      pip install pytest pytest-azurepipelines
      pytest hdbscan/tests --show-capture=no -v --disable-warnings --junitxml=pytest.xml --cov=hdbscan/ --cov-report=xml --cov-report=html
      codecov
    displayName: 'pytest'
  - task: PublishTestResults@2
    inputs:
      testResultsFiles: 'pytest.xml'
      testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)'
    condition: succeededOrFailed()
126 changes: 126 additions & 0 deletions docs/dbscan_from_hdbscan.rst
@@ -0,0 +1,126 @@

Extracting DBSCAN* clustering from HDBSCAN*
===========================================

There are a number of reasons one might prefer `DBSCAN <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`__'s
clustering over that of HDBSCAN*. The biggest difficulty many people have with
DBSCAN is that its epsilon distance parameter can be hard to determine and often
requires a great deal of trial and error to tune. If your data live in a more
interpretable space and you have a good notion of distance in that space, this
problem is certainly mitigated, and you may want to set a very specific epsilon
distance for your use case. Another viable use case is that you are interested
in a constant density clustering. HDBSCAN* does variable density clustering by
default, looking for the clusters that persist over a wide range of epsilon
distance parameters in order to find a 'natural' clustering. This might not be
the right result for your application; a DBSCAN clustering at a particular
epsilon value might work better for your particular task.

HDBSCAN returns a very natural clustering of your data, which is often useful
when exploring a new data set. That doesn't necessarily make it the right
clustering algorithm for every task.

HDBSCAN* can best be thought of as a DBSCAN* implementation which varies across
all epsilon values and extracts the clusters that persist over the widest range
of these parameter choices. It is therefore able to ignore the epsilon parameter
entirely and needs only a minimum cluster size as its single input parameter.
The 'eom' (Excess of Mass) cluster selection method then returns the clusters
with the best stability over epsilon.

There are a number of alternative ways of extracting a flat clustering from
the HDBSCAN* hierarchical tree. If one is interested in finer resolution
clusters while still maintaining variable density, one could set
``cluster_selection_method='leaf'`` to extract the leaves of the condensed
tree instead of the most persistent clusters. For more details on these
cluster selection methods see :ref:`leaf_clustering_label`.
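
A minimal sketch of what the leaf-based selection looks like in code (the data
``X`` and the parameter values here are illustrative placeholders):

.. code:: python

    import hdbscan

    # 'leaf' extracts the leaves of the condensed tree rather than the most
    # persistent ('eom') clusters, yielding finer grained clusters.
    leaf_clusterer = hdbscan.HDBSCAN(min_cluster_size=15,
                                     cluster_selection_method='leaf').fit(X)
    leaf_labels = leaf_clusterer.labels_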

If one isn't interested in the variable density clustering that is the hallmark
of HDBSCAN*, it is relatively easy to extract any DBSCAN* clustering from a
single run of HDBSCAN*. This has the advantage of allowing you to perform a
single, computationally efficient HDBSCAN* run and then quickly search over the
DBSCAN* parameter space by extracting clustering results from our
pre-constructed tree. This can save significant computational time when
searching across multiple cluster parameter settings on large amounts of data.
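
As a sketch of what such a parameter sweep might look like (the epsilon grid
and the ``min_samples``/``min_cluster_size`` values below are arbitrary choices
for illustration):

.. code:: python

    import numpy as np
    import hdbscan

    clusterer = hdbscan.HDBSCAN(min_samples=5).fit(X)

    # Each extraction reuses the tree built by the single HDBSCAN* run above,
    # so sweeping over epsilon is comparatively cheap.
    sweep = {
        eps: clusterer.dbscan_clustering(cut_distance=eps, min_cluster_size=5)
        for eps in np.linspace(0.1, 1.0, 10)
    }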

Alternatively, one could make use of the ``cluster_selection_epsilon`` parameter
as a post-processing step with any ``cluster_selection_method`` in order to
return a hybrid clustering of DBSCAN* and HDBSCAN*. For more details on
this see :doc:`how_to_use_epsilon`.
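
A minimal sketch of that hybrid approach, assuming an illustrative threshold of
0.25:

.. code:: python

    import hdbscan

    # Below the epsilon threshold clusters are no longer split further, giving
    # DBSCAN*-like behaviour; above it the usual HDBSCAN* selection applies.
    hybrid_clusterer = hdbscan.HDBSCAN(min_cluster_size=15,
                                       cluster_selection_epsilon=0.25,
                                       cluster_selection_method='eom').fit(X)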

In order to extract a DBSCAN* clustering from an HDBSCAN* run we must first
train an HDBSCAN model on our data.

.. code:: python

    import hdbscan

    h_cluster = hdbscan.HDBSCAN(min_samples=5, match_reference_implementation=True).fit(X)

The ``min_cluster_size`` parameter is unimportant in this case, since it is
only used in the creation of our condensed tree, which we won't be using here.
Now we choose a ``cut_distance``, which is just another name for the epsilon
threshold in DBSCAN, and pass it to our
:py:meth:`~hdbscan.hdbscan_.dbscan_clustering` method.

.. code:: python

    import seaborn as sns

    eps = 0.2
    labels = h_cluster.dbscan_clustering(cut_distance=eps, min_cluster_size=5)
    sns.scatterplot(x=X[:,0], y=X[:,1], hue=labels.astype(str));

.. image:: images/dbscan_from_hdbscan_clustering.png
    :align: center

It should be noted that a DBSCAN* clustering extracted from our HDBSCAN* tree
will not precisely match the clustering results from sklearn's DBSCAN
implementation. Our clustering results should better match DBSCAN* (which can
be thought of as DBSCAN without the border points). As such, when comparing the
two results one should expect them to differ mostly in the points that DBSCAN
considers border points. We'll deal with this by comparing the two clusterings
only on the points that DBSCAN identifies as core points. We can see below that
the differences between the two clusterings mostly occur at the boundaries of
the clusters. This matches our intuition of stability within the core points.

.. image:: images/dbscan_from_hdbscan_comparision.png
    :align: center
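
A sketch of how one might set up this core-point comparison, using sklearn's
``core_sample_indices_`` to mask out border points (``X``, ``eps`` and the
parameter values are again illustrative):

.. code:: python

    import numpy as np
    import hdbscan
    from sklearn.cluster import DBSCAN
    from sklearn.metrics import adjusted_rand_score

    eps = 0.2

    sk_model = DBSCAN(eps=eps, min_samples=5).fit(X)
    core_mask = np.zeros(X.shape[0], dtype=bool)
    core_mask[sk_model.core_sample_indices_] = True

    h_cluster = hdbscan.HDBSCAN(min_samples=5).fit(X)
    h_labels = h_cluster.dbscan_clustering(cut_distance=eps, min_cluster_size=5)

    # Compare only on DBSCAN's core points; border points are precisely where
    # DBSCAN and DBSCAN* are expected to disagree.
    score = adjusted_rand_score(sk_model.labels_[core_mask], h_labels[core_mask])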

For a slightly more empirical comparison we make use of the `adjusted rand score <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html>`__
to compare the clustering of the core points between a DBSCAN clustering from
sklearn and a DBSCAN* clustering extracted from our HDBSCAN* object.

.. image:: images/dbscan_from_hdbscan_percentage_core.png
    :align: center

.. image:: images/dbscan_from_hdbscan_number_of_clusters.png
    :align: center

We see that for very small epsilon values the number of clusters found by the
two algorithms tends to differ quite a bit, largely because a large number of
points are considered boundary points rather than core points. As the epsilon
value increases, more and more points are considered core and the numbers of
clusters generated by the two algorithms converge.

Additionally, the adjusted rand score between the core points of both
algorithms stays consistently high (mostly 1.0) across our entire range of
epsilon. There may be some minor discrepancies between core point results,
largely due to implementation details and optimizations in the code base.

Why might one extract DBSCAN* clustering results from a single HDBSCAN* run
instead of making use of sklearn's DBSCAN code? The short answer is efficiency.
If you aren't sure what epsilon parameter to select for DBSCAN then you may have
to run the algorithm many times on your data set. While those runs can be
inexpensive for very small epsilon values, they can get quite expensive for
large parameter values.

In this small benchmark case of 50,000 two-dimensional data points we break
even after trying only two epsilon parameters with DBSCAN, or a single run with
a large epsilon value. This trend is only exacerbated for larger data sets in
higher dimensional spaces. For more detailed scaling experiments see
`Accelerated Hierarchical Density Clustering <https://arxiv.org/abs/1705.07321>`__
by McInnes and Healy.

.. image:: images/dbscan_from_hdbscan_timing.png
    :align: center
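
A rough sketch of such a timing comparison (the epsilon grid and parameter
values are illustrative assumptions, and wall-clock timing is only indicative):

.. code:: python

    import time

    import hdbscan
    from sklearn.cluster import DBSCAN

    eps_grid = [0.1, 0.5]  # hypothetical epsilon values one might need to try

    # Several independent DBSCAN runs, one per candidate epsilon.
    start = time.perf_counter()
    for eps in eps_grid:
        DBSCAN(eps=eps, min_samples=5).fit(X)
    dbscan_total = time.perf_counter() - start

    # One HDBSCAN* fit followed by a cheap DBSCAN* extraction per epsilon.
    start = time.perf_counter()
    clusterer = hdbscan.HDBSCAN(min_samples=5).fit(X)
    for eps in eps_grid:
        clusterer.dbscan_clustering(cut_distance=eps, min_cluster_size=5)
    hdbscan_total = time.perf_counter() - start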






1 change: 1 addition & 0 deletions docs/index.rst
@@ -25,6 +25,7 @@ User Guide / Tutorial
prediction_tutorial
soft_clustering
how_to_use_epsilon
dbscan_from_hdbscan
faq

Background on Clustering with HDBSCAN
23 changes: 14 additions & 9 deletions hdbscan/_hdbscan_boruvka.pyx
@@ -423,7 +423,7 @@ cdef class KDTreeBoruvkaAlgorithm (object):
else:
datasets.append(np.asarray(self.tree.data[i*split_cnt:(i+1)*split_cnt]))

knn_data = Parallel(n_jobs=self.n_jobs)(
knn_data = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(
delayed(_core_dist_query)
(self.core_dist_tree, points,
self.min_samples + 1)
@@ -454,8 +454,10 @@ cdef class KDTreeBoruvkaAlgorithm (object):
# issues, but we'll get quite a few, and they are the hard ones to
# get, so fill in any we can and then run update components.
for n in range(self.num_points):
for i in range(1, self.min_samples + 1):
for i in range(0, self.min_samples + 1):
m = knn_indices[n, i]
if n == m:
continue
if self.core_distance[m] <= self.core_distance[n]:
self.candidate_point[n] = n
self.candidate_neighbor[n] = m
@@ -745,7 +747,7 @@ cdef class KDTreeBoruvkaAlgorithm (object):
# then propagate the results of that computation
# up the tree.
new_bound = min(new_upper_bound,
new_lower_bound + 2 * node1_info.radius)
new_lower_bound + 2 * self.dist._dist_to_rdist(node1_info.radius))
# new_bound = new_upper_bound
if new_bound < self.bounds_ptr[node1]:
self.bounds_ptr[node1] = new_bound
@@ -1025,36 +1027,39 @@ cdef class BallTreeBoruvkaAlgorithm (object):
else:
datasets.append(np.asarray(self.tree.data[i*split_cnt:(i+1)*split_cnt]))

knn_data = Parallel(n_jobs=self.n_jobs)(
knn_data = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(
delayed(_core_dist_query)
(self.core_dist_tree, points,
self.min_samples)
self.min_samples + 1)
for points in datasets)
knn_dist = np.vstack([x[0] for x in knn_data])
knn_indices = np.vstack([x[1] for x in knn_data])
else:
knn_dist, knn_indices = self.core_dist_tree.query(
self.tree.data,
k=self.min_samples,
k=self.min_samples + 1,
dualtree=True,
breadth_first=True)

self.core_distance_arr = knn_dist[:, self.min_samples - 1].copy()
self.core_distance_arr = knn_dist[:, self.min_samples].copy()
self.core_distance = (<np.double_t[:self.num_points:1]> (
<np.double_t *> self.core_distance_arr.data))

# Since we already computed NN distances for the min_samples closest
# points we can use this to do the first round of boruvka -- we won't
# get every point due to core_distance/mutual reachability distance
# issues, but we'll get quite a few, and they are the hard ones to get,
# so fill in any we ca and then run update components.
# so fill in any we can and then run update components.
for n in range(self.num_points):
for i in range(self.min_samples - 1, 0):
for i in range(0, self.min_samples + 1):
m = knn_indices[n, i]
if n == m:
continue
if self.core_distance[m] <= self.core_distance[n]:
self.candidate_point[n] = n
self.candidate_neighbor[n] = m
self.candidate_distance[n] = self.core_distance[n]
break

self.update_components()

