我有一个应用程序需要计算 Pandas 多级索引(MultiIndex)DataFrame 的滚动求和,我想找到一种缩短处理时间的方法。
mul_df() 是用于创建演示用多级索引 DataFrame 的函数。
import itertools
import numpy as np
import pandas as pd
def mul_df(level1_rownum, level2_rownum, col_num):
    """Create a demo DataFrame with a two-level row index.

    The frame has ``level1_rownum * level2_rownum`` rows and ``col_num``
    columns filled with samples from ``np.random.randn``.

    Args:
        level1_rownum: Number of distinct outer labels ('A0000', 'A0001', ...).
        level2_rownum: Number of inner labels ('B000', 'B001', ...) under
            each outer label.
        col_num: Number of data columns ('COL000', 'COL001', ...).

    Returns:
        A DataFrame indexed by ('IDX_1', 'IDX_2'), with the outer label
        varying slowest.
    """
    index_name = ['IDX_1', 'IDX_2']
    col_name = ['COL' + str(x).zfill(3) for x in range(col_num)]

    # Outer labels: each 'Axxxx' is repeated once per inner row so that the
    # rows of one outer group are contiguous.
    first_level_dt = [
        'A' + str(x).zfill(4)
        for x in range(level1_rownum)
        for _ in range(level2_rownum)
    ]
    # Inner labels: the full 'B000'..'Bnnn' sequence, cycled once per outer label.
    second_level_dt = (
        ['B' + str(x).zfill(3) for x in range(level2_rownum)] * level1_rownum
    )

    dt = pd.DataFrame(
        np.random.randn(level1_rownum * level2_rownum, col_num),
        columns=col_name,
    )
    dt[index_name[0]] = first_level_dt
    dt[index_name[1]] = second_level_dt
    # set_index defaults (drop=True, inplace=False) move the two label
    # columns into the index and return a new frame.
    return dt.set_index(index_name)
例如 :
>>> df = mul_df(4,5,3)
COL000 COL001 COL002
IDX_1 IDX_2
A0000 B000 0.2317 -0.6122 0.2289
B001 -0.9218 -0.2918 1.7295
B002 0.1368 0.6659 -1.9193
B003 0.3839 -0.8542 -0.3065
B004 2.0361 -0.4601 1.1246
A0001 B000 0.3039 -0.6761 1.3762
B001 1.1767 0.8465 -0.1745
B002 0.4937 1.6774 -0.3038
B003 -0.3627 -1.6413 -0.7373
B004 -0.0149 1.5900 0.3385
A0002 B000 0.0326 0.2637 1.7990
B001 -0.1071 0.6097 -0.2812
B002 -0.2199 0.7360 1.9425
B003 -1.0423 0.6763 -0.2479
B004 -0.9024 0.3016 -2.7585
A0003 B000 0.2550 0.0470 0.6849
B001 0.5986 0.3283 1.6327
B002 0.8929 -1.1128 -0.9495
B003 -0.5633 1.7935 0.1652
B004 1.0417 -0.4833 0.3413
然后使用下面的命令,按 'IDX_1' 分组计算每一列的滚动求和(窗口大小为 4):
>>> df.groupby(level='IDX_1').apply(lambda x: pd.rolling_sum(x,4))
COL000 COL001 COL002
IDX_1 IDX_2
A0000 B000 NaN NaN NaN
B001 NaN NaN NaN
B002 NaN NaN NaN
B003 -0.1694 -1.0923 -0.2675
B004 1.6350 -0.9402 0.6282
A0001 B000 NaN NaN NaN
B001 NaN NaN NaN
B002 NaN NaN NaN
B003 1.6116 0.2064 0.1606
B004 1.2928 2.4726 -0.8771
A0002 B000 NaN NaN NaN
B001 NaN NaN NaN
B002 NaN NaN NaN
B003 -1.3367 2.2857 3.2125
B004 -2.2717 2.3236 -1.3451
A0003 B000 NaN NaN NaN
B001 NaN NaN NaN
B002 NaN NaN NaN
B003 1.1832 1.0559 1.5334
B004 1.9699 0.5256 1.1898
>>>
我尝试对一个大型 DataFrame 计算 rolling_sum():
In [1]: df = mul_df(1000,25,1000)
In [2]: timeit df.groupby(level='IDX_1').apply(lambda x: pd.rolling_sum(x,4))
1 loops, best of 3: 52.1 s per loop
形状为 (1000*25, 1000) 的 DataFrame 需要 52.1 秒。如何加快 rolling_sum 的速度(也就是说,有没有其他方法可以得到相同的计算结果,但花费更少的时间)?
编辑(补充运行 waitingkuo 的解决方案时出现的内存错误信息):
In [1]: df = mul_df(1000,25,1000)
In [2]: k2 = df.frs(4)
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-2-1b54b2662162> in <module>()
----> 1 k2 = df.frs(4)
F:\STK Analysis\Kits\Dev_Tools\FinReporter\FM_CORE.pyc in wrapped(*args, **kwargs)
149 from datetime import datetime
150 t1 = datetime.now()
--> 151 rst = fn(*args, **kwargs)
152 t2 = datetime.now()
153 print "Time: %0.3f"%((t2-t1).seconds + (t2-t1).microseconds/1000000.0)
F:\STK Analysis\Kits\Dev_Tools\FinReporter\FM_CORE.pyc in _frs(df, n)
864 ''' fast_rolling_sum , http://stackoverflow.com/questions/15652343/how-to-speed-up-pandas-rolling-sum '''
865 grp = df.groupby(level='STK_ID')
--> 866 return np.sum([grp.shift(i) for i in range(n)])
867 DataFrame.frs = _frs
868
D:\Python\lib\site-packages\pandas\core\groupby.pyc in wrapper(*args, **kwargs)
259 return self.apply(curried_with_axis)
260 except Exception:
--> 261 return self.apply(curried)
262
263 return wrapper
D:\Python\lib\site-packages\pandas\core\groupby.pyc in apply(self, func, *args, **kwargs)
320 func = _intercept_function(func)
321 f = lambda g: func(g, *args, **kwargs)
--> 322 return self._python_apply_general(f)
323
324 def _python_apply_general(self, f):
D:\Python\lib\site-packages\pandas\core\groupby.pyc in _python_apply_general(self, f)
323
324 def _python_apply_general(self, f):
--> 325 keys, values, mutated = self.grouper.apply(f, self.obj, self.axis)
326
327 return self._wrap_applied_output(keys, values,
D:\Python\lib\site-packages\pandas\core\groupby.pyc in apply(self, f, data, axis, keep_internal)
583 if hasattr(splitter, 'fast_apply') and axis == 0:
584 try:
--> 585 values, mutated = splitter.fast_apply(f, group_keys)
586 return group_keys, values, mutated
587 except lib.InvalidApply:
D:\Python\lib\site-packages\pandas\core\groupby.pyc in fast_apply(self, f, names)
2136 return [], True
2137
-> 2138 sdata = self._get_sorted_data()
2139 results, mutated = lib.apply_frame_axis0(sdata, f, names, starts, ends)
2140
D:\Python\lib\site-packages\pandas\core\groupby.pyc in _get_sorted_data(self)
2103
2104 def _get_sorted_data(self):
-> 2105 return self.data.take(self.sort_idx, axis=self.axis)
2106
2107 def _chop(self, sdata, slice_obj):
D:\Python\lib\site-packages\pandas\core\frame.pyc in take(self, indices, axis)
2900 new_values = com.take_2d(self.values,
2901 com._ensure_int64(indices),
-> 2902 axis=axis)
2903 if axis == 0:
2904 new_columns = self.columns
D:\Python\lib\site-packages\pandas\core\common.pyc in take_2d(arr, indexer, out, mask, needs_masking, axis, fill_value)
426 elif dtype_str in ('float64', 'object', 'datetime64[ns]'):
427 if out is None:
--> 428 out = np.empty(out_shape, dtype=arr.dtype)
429 take_f = _get_take2d_function(dtype_str, axis=axis)
430 take_f(arr, _ensure_int64(indexer), out=out, fill_value=fill_value)
MemoryError:
In [3]: