我正在尝试在 Databricks 环境中对一个示例 DataFrame 运行 pandas-profiling,但收到了一个与 matplotlib 相关的错误,不确定问题出在 matplotlib 还是 pandas-profiling。任何帮助都将不胜感激。
Databricks 运行时配置: 7.4 ML(包括 Apache Spark 3.0.1、Scala 2.12)
安装方式如下:
!pip install pandas-profiling[notebook]
代码
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
profile = ProfileReport(df, title='EDA Report', explorative=True)
profile.to_file("/dbfs/mnt/sb2/EDA_Reports/EDA.html")
错误日志与堆栈跟踪:
Summarize dataset: 93%|█████████▎| 106/114 [11:30<07:27, 55.91s/it, Calculate cramers correlation]/databricks/python/lib/python3.7/site-packages/pandas_profiling/model/correlations.py:139: UserWarning: There was an attempt to calculate the cramers correlation, but this failed.
To hide this warning, disable the calculation
(using `df.profile_report(correlations={"cramers": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/pandas-profiling/pandas-profiling/issues
(include the error message: 'No data; `observed` has size 0.')
(include the error message: '{error}')"""
Summarize dataset: 94%|█████████▍| 107/114 [11:56<00:46, 6.69s/it, Get scatter matrix]
RuntimeError: "/databricks/python/lib/python3.7/site-packages/matplotlib/mpl-data" should be a path but it does not exist
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
/databricks/python/lib/python3.7/site-packages/pandas_profiling/visualisation/context.py in manage_matplotlib_context()
79 sns.set_style(style="white")
---> 80 yield
81 finally:
/databricks/python/lib/python3.7/contextlib.py in inner(*args, **kwds)
73 with self._recreate_cm():
---> 74 return func(*args, **kwds)
75 return inner
/databricks/python/lib/python3.7/site-packages/pandas_profiling/visualisation/plot.py in scatter_pairwise(series1, series2, x_label, y_label)
276 plt.scatter(series1, series2, color=color)
--> 277 return plot_360_n0sc0pe(plt)
278
/databricks/python/lib/python3.7/site-packages/pandas_profiling/visualisation/utils.py in plot_360_n0sc0pe(plt, image_format, attempts)
67 image_str = StringIO()
---> 68 plt.savefig(image_str, format=image_format)
69 image_str.seek(0)
/databricks/python/lib/python3.7/site-packages/matplotlib/pyplot.py in savefig(*args, **kwargs)
/databricks/python/lib/python3.7/site-packages/matplotlib/figure.py in savefig(self, fname, transparent, **kwargs)
/databricks/python/lib/python3.7/site-packages/matplotlib/backend_bases.py in print_figure(self, filename, dpi, facecolor, edgecolor, orientation, format, bbox_inches, **kwargs)
/databricks/python/lib/python3.7/site-packages/matplotlib/backend_bases.py in _get_output_canvas(self, fmt)
/databricks/python/lib/python3.7/site-packages/matplotlib/backend_bases.py in get_registered_canvas_class(format)
/databricks/python/lib/python3.7/importlib/__init__.py in import_module(name, package)
126 level += 1
--> 127 return _bootstrap._gcd_import(name[level:], package, level)
128
/databricks/python/lib/python3.7/importlib/_bootstrap.py in _gcd_import(name, package, level)
/databricks/python/lib/python3.7/importlib/_bootstrap.py in _find_and_load(name, import_)
/databricks/python/lib/python3.7/importlib/_bootstrap.py in _find_and_load_unlocked(name, import_)
/databricks/python/lib/python3.7/importlib/_bootstrap.py in _find_spec(name, path, target)
/databricks/python/lib/python3.7/importlib/_bootstrap_external.py in find_spec(cls, fullname, path, target)
/databricks/python/lib/python3.7/importlib/_bootstrap_external.py in _get_spec(cls, fullname, path, target)
/databricks/python/lib/python3.7/importlib/_bootstrap_external.py in find_spec(self, fullname, target)
/databricks/python/lib/python3.7/importlib/_bootstrap_external.py in _fill_cache(self)
OSError: [Errno 116] Stale file handle: '/databricks/python/lib/python3.7/site-packages/matplotlib/backends'
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
<command-3404575914441933> in <module>
1 profile = ProfileReport(df, title='EDA Report', explorative=True)
----> 2 profile.to_file("/dbfs/mnt/sb2/naga/dataset/EDA_Reports/Digital_HO_New_Features_EDA.html")
/databricks/python/lib/python3.7/site-packages/pandas_profiling/profile_report.py in to_file(self, output_file, silent)
272 create_html_assets(output_file)
273
--> 274 data = self.to_html()
275
276 if output_file.suffix != ".html":
/databricks/python/lib/python3.7/site-packages/pandas_profiling/profile_report.py in to_html(self)
376
377 """
--> 378 return self.html
379
380 def to_json(self) -> str:
/databricks/python/lib/python3.7/site-packages/pandas_profiling/profile_report.py in html(self)
195 def html(self):
196 if self._html is None:
--> 197 self._html = self._render_html()
198 return self._html
199
/databricks/python/lib/python3.7/site-packages/pandas_profiling/profile_report.py in _render_html(self)
302 from pandas_profiling.report.presentation.flavours import HTMLReport
303
--> 304 report = self.report
305
306 disable_progress_bar = not config["progress_bar"].get(bool)
/databricks/python/lib/python3.7/site-packages/pandas_profiling/profile_report.py in report(self)
189 def report(self):
190 if self._report is None:
--> 191 self._report = get_report_structure(self.description_set)
192 return self._report
193
/databricks/python/lib/python3.7/site-packages/pandas_profiling/profile_report.py in description_set(self)
169 if self._description_set is None:
170 self._description_set = describe_df(
--> 171 self.title, self.df, self.summarizer, self.typeset, self._sample
172 )
173 return self._description_set
/databricks/python/lib/python3.7/site-packages/pandas_profiling/model/describe.py in describe(title, df, summarizer, typeset, sample)
105 # Scatter matrix
106 pbar.set_postfix_str("Get scatter matrix")
--> 107 scatter_matrix = get_scatter_matrix(df, interval_columns)
108 pbar.update()
109
/databricks/python/lib/python3.7/site-packages/pandas_profiling/model/summary.py in get_scatter_matrix(df, continuous_variables)
283 df_temp = df[[x, y]].dropna()
284 scatter_matrix[x][y] = scatter_pairwise(
--> 285 df_temp[x], df_temp[y], x, y
286 )
287 else:
/databricks/python/lib/python3.7/contextlib.py in inner(*args, **kwds)
72 def inner(*args, **kwds):
73 with self._recreate_cm():
---> 74 return func(*args, **kwds)
75 return inner
76
/databricks/python/lib/python3.7/contextlib.py in __exit__(self, type, value, traceback)
128 value = type()
129 try:
--> 130 self.gen.throw(type, value, traceback)
131 except StopIteration as exc:
132 # Suppress StopIteration *unless* it's the same exception that
/databricks/python/lib/python3.7/site-packages/pandas_profiling/visualisation/context.py in manage_matplotlib_context()
83 with warnings.catch_warnings():
84 warnings.filterwarnings("ignore", category=matplotlib.cbook.mplDeprecation)
---> 85 matplotlib.rcParams.update(originalRcParams) # revert to original rcParams
/databricks/python/lib/python3.7/_collections_abc.py in update(*args, **kwds)
839 if isinstance(other, Mapping):
840 for key in other:
--> 841 self[key] = other[key]
842 elif hasattr(other, "keys"):
843 for key in other.keys():
/databricks/python/lib/python3.7/site-packages/matplotlib/__init__.py in __setitem__(self, key, val)
/databricks/python/lib/python3.7/site-packages/matplotlib/rcsetup.py in validate_path_exists(s)
RuntimeError: "/databricks/python/lib/python3.7/site-packages/matplotlib/mpl-data" should be a path but it does not exist