python - 使用 pandas_profiling profile_report 时出现 MemoryError

Question

我正在尝试分析一个 Excel 文件，它是一个非常小的数据集，只有 30 列和 535 行。但是当我运行 profile_report 函数时，它每次都会在不同的进度百分比处停止，并且总是出现相同的错误消息：

---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-41-283dd2cb2000> in <module>
      1 df=pd.read_excel(path_working+'Documents/Information/'+'sample.xlsx')
      2 profile = df.profile_report(title='Sample Exploratory')
----> 3 profile.to_file(path_working+'sample.html')

~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in to_file(self, output_file, silent)
    276                 create_html_assets(output_file)
    277 
--> 278             data = self.to_html()
    279 
    280             if output_file.suffix != ".html":

~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in to_html(self)
    384 
    385         """
--> 386         return self.html
    387 
    388     def to_json(self) -> str:

~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in html(self)
    199     def html(self):
    200         if self._html is None:
--> 201             self._html = self._render_html()
    202         return self._html
    203 

~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in _render_html(self)
    306         from pandas_profiling.report.presentation.flavours import HTMLReport
    307 
--> 308         report = self.report
    309 
    310         disable_progress_bar = not config["progress_bar"].get(bool)

~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in report(self)
    193     def report(self):
    194         if self._report is None:
--> 195             self._report = get_report_structure(self.description_set)
    196         return self._report
    197 

~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in description_set(self)
    172     def description_set(self):
    173         if self._description_set is None:
--> 174             self._description_set = describe_df(
    175                 self.title, self.df, self.summarizer, self.typeset, self._sample
    176             )

~\anaconda3\lib\site-packages\pandas_profiling\model\describe.py in describe(title, df, summarizer, typeset, sample)
     72         total=number_of_tasks, desc="Summarize dataset", disable=disable_progress_bar
     73     ) as pbar:
---> 74         series_description = get_series_descriptions(df, summarizer, typeset, pbar)
     75 
     76         pbar.set_postfix_str("Get variable types")

~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in get_series_descriptions(df, summarizer, typeset, pbar)
     97         # TODO: use `Pool` for Linux-based systems
     98         with multiprocessing.pool.ThreadPool(pool_size) as executor:
---> 99             for i, (column, description) in enumerate(
    100                 executor.imap_unordered(multiprocess_1d, args)
    101             ):

~\anaconda3\lib\multiprocessing\pool.py in next(self, timeout)
    866         if success:
    867             return value
--> 868         raise value
    869 
    870     __next__ = next                    # XXX

~\anaconda3\lib\multiprocessing\pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
    123         job, i, func, args, kwds = task
    124         try:
--> 125             result = (True, func(*args, **kwds))
    126         except Exception as e:
    127             if wrap_exception and func is not _helper_reraises_exception:

~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in multiprocess_1d(args)
     76         """
     77         column, series = args
---> 78         return column, describe_1d(series, summarizer, typeset)
     79 
     80     pool_size = config["pool_size"].get(int)

~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in describe_1d(series, summarizer, typeset)
     50         vtype = typeset.detect_type(series)
     51 
---> 52     return summarizer.summarize(series, dtype=vtype)
     53 
     54 

~\anaconda3\lib\site-packages\pandas_profiling\model\summarizer.py in summarize(self, series, dtype)
     54         """
     55         summarizer_func = compose(self.summary_map.get(dtype, []))
---> 56         _, summary = summarizer_func(series, {"type": dtype})
     57         return summary
     58 

~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
     21                 return f(*x)
     22             else:
---> 23                 return f(*res)
     24 
     25         return func2

~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
     21                 return f(*x)
     22             else:
---> 23                 return f(*res)
     24 
     25         return func2

~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
     21                 return f(*x)
     22             else:
---> 23                 return f(*res)
     24 
     25         return func2

~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
     17     def func(f, g):
     18         def func2(*x):
---> 19             res = g(*x)
     20             if type(res) == bool:
     21                 return f(*x)

~\anaconda3\lib\site-packages\pandas_profiling\model\summary_algorithms.py in inner(series, summary)
     70         if not summary["hashable"]:
     71             return series, summary
---> 72         return fn(series, summary)
     73 
     74     return inner

~\anaconda3\lib\site-packages\visions\utils\series_utils.py in inner(series, state, *args, **kwargs)
     40                 return False
     41 
---> 42         return fn(series, state, *args, **kwargs)
     43 
     44     return inner

~\anaconda3\lib\site-packages\pandas_profiling\model\summary_algorithms.py in describe_numeric_1d(series, summary)
    208 
    209     if chi_squared_threshold > 0.0:
--> 210         stats["chi_squared"] = chi_square(finite_values)
    211 
    212     stats["range"] = stats["max"] - stats["min"]

~\anaconda3\lib\site-packages\pandas_profiling\model\summary_helpers.py in chi_square(values, histogram)
    352 def chi_square(values=None, histogram=None):
    353     if histogram is None:
--> 354         histogram, _ = np.histogram(values, bins="auto")
    355     return dict(chisquare(histogram)._asdict())
    356 

<__array_function__ internals> in histogram(*args, **kwargs)

~\anaconda3\lib\site-packages\numpy\lib\histograms.py in histogram(a, bins, range, normed, weights, density)
    790     a, weights = _ravel_and_check_weights(a, weights)
    791 
--> 792     bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights)
    793 
    794     # Histogram is an integer or a float array depending on the weights.

~\anaconda3\lib\site-packages\numpy\lib\histograms.py in _get_bin_edges(a, bins, range, weights)
    444 
    445         # bin edges must be computed
--> 446         bin_edges = np.linspace(
    447             first_edge, last_edge, n_equal_bins + 1,
    448             endpoint=True, dtype=bin_type)

<__array_function__ internals> in linspace(*args, **kwargs)

~\anaconda3\lib\site-packages\numpy\core\function_base.py in linspace(start, stop, num, endpoint, retstep, dtype, axis)
    126 
    127     delta = stop - start
--> 128     y = _nx.arange(0, num, dtype=dt).reshape((-1,) + (1,) * ndim(delta))
    129     # In-place multiplication y *= delta/div is faster, but prevents the multiplicant
    130     # from overriding what class is produced, and thus prevents, e.g. use of Quantities,

MemoryError: Unable to allocate 1.75 EiB for an array with shape (251938683619878560,) and data type float64

我在不同的 python 安装中运行了相同的代码，它运行良好。

提前谢谢大家，如果您需要更多信息，请告诉我。

score 1 · Accepted Answer

这是 numpy.histogram 中的一个错误（ https://github.com/numpy/numpy/issues/10297 ），在 Stack Overflow 上也有人报告过（“Numpy histogram 在小数据集上非常慢”）。

此错误是由对 np.histogram(x, bins='auto') 的调用引起的。当输入包含非常大的值时，“auto”方法可能会尝试生成数量极多、无法放入内存的 bin，从而导致失败。

作为一种解决方法，您可以在生成报告之前手动删除较大的值。

python - 使用 pandas_profiling profile_report 时出现 MemoryError

1 回答 1

Related

Reference