I'm combining RAPIDS UMAP with HDBSCAN inside the RAPIDS docker container rapidsai/rapidsai-core:0.18-cuda11.0-runtime-ubuntu18.04-py3.7:
import cudf
import cupy
import hdbscan
from cuml.manifold import UMAP
from cuml.experimental.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

# Synthetic data, generated on the host and moved to the GPU
blobs, labels = make_blobs(n_samples=100000, n_features=10)
df_gpu = cudf.DataFrame(blobs)

# Standardize on the GPU
scaler = StandardScaler()
cupy_scaled = scaler.fit_transform(df_gpu.values)

# GPU UMAP down to 3 dimensions
projector = UMAP(n_components=3, n_neighbors=2000)
cupy_projected = projector.fit_transform(cupy_scaled)

# Copy the embedding back to host memory for the CPU HDBSCAN implementation
numpy_projected = cupy.asnumpy(cupy_projected)

clusterer = hdbscan.HDBSCAN(min_cluster_size=1000, prediction_data=True,
                            gen_min_span_tree=True)  # , core_dist_n_jobs=1
clusterer.fit(numpy_projected)
I get the error below. Setting core_dist_n_jobs=1 fixes it (see the snippet after the traceback), but it makes the code noticeably slower:
---------------------------------------------------------------------------
TerminatedWorkerError                     Traceback (most recent call last)
      1 clusterer = hdbscan.HDBSCAN(min_cluster_size=1000, prediction_data=True, gen_min_span_tree=True)
----> 2 clusterer.fit(numpy_projected)

/opt/conda/envs/rapids/lib/python3.7/site-packages/hdbscan/hdbscan_.py in fit(self, X, y)
    917              self._condensed_tree,
    918              self._single_linkage_tree,
--> 919              self._min_spanning_tree) = hdbscan(X, **kwargs)
    920
    921          if self.prediction_data:

/opt/conda/envs/rapids/lib/python3.7/site-packages/hdbscan/hdbscan_.py in hdbscan(X, min_cluster_size, min_samples, alpha, cluster_selection_epsilon, metric, p, leaf_size, algorithm, memory, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, cluster_selection_method, allow_single_cluster, match_reference_implementation, **kwargs)
    613                      approx_min_span_tree,
    614                      gen_min_span_tree,
--> 615                      core_dist_n_jobs, **kwargs)
    616          else:  # Metric is a valid BallTree metric
    617              # TO DO: Need heuristic to decide when to go to boruvka;

/opt/conda/envs/rapids/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353
    354     def call_and_shelve(self, *args, **kwargs):

/opt/conda/envs/rapids/lib/python3.7/site-packages/hdbscan/hdbscan_.py in _hdbscan_boruvka_kdtree(X, min_samples, alpha, metric, p, leaf_size, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, **kwargs)
    276                                  leaf_size=leaf_size // 3,
    277                                  approx_min_span_tree=approx_min_span_tree,
--> 278                                  n_jobs=core_dist_n_jobs, **kwargs)
    279     min_spanning_tree = alg.spanning_tree()
    280     # Sort edges of the min_spanning_tree by weight

hdbscan/_hdbscan_boruvka.pyx in hdbscan._hdbscan_boruvka.KDTreeBoruvkaAlgorithm.__init__()

hdbscan/_hdbscan_boruvka.pyx in hdbscan._hdbscan_boruvka.KDTreeBoruvkaAlgorithm._compute_bounds()

/opt/conda/envs/rapids/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
   1052
   1053             with self._backend.retrieval_context():
-> 1054                 self.retrieve()
   1055             # Make sure that we get a last message telling us we are done
   1056             elapsed_time = time.time() - self._start_time

/opt/conda/envs/rapids/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self)
    931             try:
    932                 if getattr(self._backend, 'supports_timeout', False):
--> 933                     self._output.extend(job.get(timeout=self.timeout))
    934                 else:
    935                     self._output.extend(job.get())

/opt/conda/envs/rapids/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
    540         AsyncResults.get from multiprocessing."""
    541         try:
--> 542             return future.result(timeout=timeout)
    543         except CfTimeoutError as e:
    544             raise TimeoutError from e

/opt/conda/envs/rapids/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
    433                 raise CancelledError()
    434             elif self._state == FINISHED:
--> 435                 return self.__get_result()
    436             else:
    437                 raise TimeoutError()

/opt/conda/envs/rapids/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
    382     def __get_result(self):
    383         if self._exception:
--> 384             raise self._exception
    385         else:
    386             return self._result

TerminatedWorkerError: A worker process managed by the executor was terminated abruptly. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of workers are {EXIT(1)}
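
For reference, this is the workaround that avoids the crash. It forces the core-distance computation onto a single joblib process, which is why it is slower:

# Workaround: disable joblib multiprocessing for the core-distance step.
# Avoids the TerminatedWorkerError, but serializes that part of the work.
clusterer = hdbscan.HDBSCAN(min_cluster_size=1000, prediction_data=True,
                            gen_min_span_tree=True, core_dist_n_jobs=1)
clusterer.fit(numpy_projected)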
Is there a way to fix this while still keeping HDBSCAN fast?
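
One middle ground I have considered, but not yet verified, is capping the worker count rather than disabling parallelism entirely. This assumes the workers are being killed by memory pressure rather than a genuine segfault:

# Untested assumption: fewer workers -> lower peak memory, with some parallelism kept.
# core_dist_n_jobs is forwarded to joblib's n_jobs for the core-distance computation;
# the value 4 here is just an illustrative choice, not a recommendation.
clusterer = hdbscan.HDBSCAN(min_cluster_size=1000, prediction_data=True,
                            gen_min_span_tree=True, core_dist_n_jobs=4)
clusterer.fit(numpy_projected)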