为了扩展 @Ian Sudbury 的答案,我把该方法绑定到 DataFrame 类上,使其可以直接在数据帧上调用(我的代码在速度上肯定还有改进空间,因为我并不清楚如何访问该类的所有内部实现)。
我还添加了后向窗口和居中窗口的功能。它们只有在远离数据边缘时才能完全正确地工作。
import pandas as pd
import numpy as np
def roll_by(self, basis, window, func, forward=True, *args, **kwargs):
    """Apply ``func`` to a value-based rolling window over this DataFrame.

    For every value ``val`` in the ``basis`` column, the rows whose basis
    value falls inside the window around ``val`` are sliced out of the frame
    and handed to ``func``.

    The window bounds are resolved with ``Index.slice_indexer``, which is
    label-based, so the bounds do not have to be present in the column and
    rows equal to either bound are included.  The ``basis`` column is
    expected to be sorted for the slicing to work properly.

    Parameters
    ----------
    basis : column label
        Column whose values position the window.
    window : scalar
        Width of the window, in the units of the ``basis`` column.
    func : callable
        Called as ``func(chunk, *args, **kwargs)`` where ``chunk`` is the
        DataFrame slice for the current window.
    forward : {True, False, 'both'}, default True
        ``True``   -> window is ``[val, val + window]``;
        ``False``  -> window is ``[val - window, val]``;
        ``'both'`` -> window is ``[val - window / 2, val + window / 2]``.
    *args, **kwargs
        Forwarded unchanged to ``func``.

    Returns
    -------
    The result of ``self[basis].apply(...)``: one entry per row, carrying
    whatever ``func`` returned for that row's window.

    Raises
    ------
    RuntimeError
        If ``forward`` is not one of the accepted options.  This is checked
        once, before any row is processed, so it is raised even when the
        frame is empty.
    """
    # Resolve the window mode once instead of re-testing it for every row.
    # Equality (not identity) is used on purpose so that values such as
    # 1 / 0 / numpy booleans keep being accepted.  The bounds are built from
    # ``val +/- window`` only, so any basis type that supports that arithmetic
    # (not just plain numbers) keeps working.
    if forward == True:  # noqa: E712
        def window_bounds(val):
            return val, val + window
    elif forward == False:  # noqa: E712
        def window_bounds(val):
            return val - window, val
    elif forward == 'both':
        def window_bounds(val):
            return val - window / 2, val + window / 2
    else:
        raise RuntimeError('Invalid option for "forward". Can only be True, False, or "both".')

    basis_index = pd.Index(self[basis])

    def apply_to_window(val):
        lower, upper = window_bounds(val)
        # slice_indexer yields a positional slice, hence iloc below.
        indexer = basis_index.slice_indexer(lower, upper)
        chunk = self.iloc[indexer]
        return func(chunk, *args, **kwargs)

    return self[basis].apply(apply_to_window)


# Expose the function as a DataFrame method: df.roll_by(...).
pd.DataFrame.roll_by = roll_by
对于其他测试,我使用了以下定义:
def rollBy_Ian_iloc(what,basis,window,func,*args,**kwargs):
    """Roll ``func`` over ``what`` using windows defined on ``basis`` (positional slicing).

    The values of ``what`` are re-indexed by the values of ``basis``; for each
    basis value ``val`` the entries between ``val`` and ``val + window`` are
    selected positionally and passed to ``func(chunk, *args, **kwargs)``.

    Note: ``basis`` must be sorted for this to work properly.

    Returns the result of ``basis.apply(...)``.
    """
    values_by_basis = pd.Series(what.values, index=basis.values)
    basis_index = values_by_basis.index

    def window_result(start):
        # slice_indexer (unlike what.loc[start:start + window]) accepts
        # window limits that are not themselves present in the index.
        positions = basis_index.slice_indexer(start, start + window, 1)
        return func(values_by_basis.iloc[positions], *args, **kwargs)

    return basis.apply(window_result)
def rollBy_Ian_index(what,basis,window,func,*args,**kwargs):
    """Roll ``func`` over ``what`` using windows defined on ``basis`` (label lookup).

    Same windowing as the positional variant, but each window is materialised
    by first taking the index labels covered by the slice and then selecting
    those labels from the re-indexed series.  The selected chunk is passed to
    ``func(chunk, *args, **kwargs)``.

    Note: ``basis`` must be sorted for this to work properly.

    Returns the result of ``basis.apply(...)``.
    """
    values_by_basis = pd.Series(what.values, index=basis.values)

    def window_result(start):
        # slice_indexer (unlike what.loc[start:start + window]) accepts
        # window limits that are not themselves present in the index.
        positions = values_by_basis.index.slice_indexer(start, start + window, 1)
        labels_in_window = values_by_basis.index[positions]
        selected = values_by_basis[labels_in_window]
        return func(selected, *args, **kwargs)

    return basis.apply(window_result)
def rollBy_Bren(what, basis, window, func):
    """Roll ``func`` over ``what`` using a boolean mask built from ``basis``.

    For each basis value ``val`` the entries of ``what`` whose basis value
    satisfies ``val <= basis < val + window`` are selected and passed to
    ``func``.  Returns the result of ``basis.apply(...)``.
    """
    def window_result(val):
        at_or_after_start = val <= basis
        before_end = basis < val + window
        return func(what[at_or_after_start & before_end])

    return basis.apply(window_result)
时间和测试:
# Benchmark fixture: 10,000 rows with two columns of uniform random draws
# ("RollBasis" drawn with bounds 0 and 100000, "ToRoll" with bounds 0 and 10),
# sorted by "RollBasis" (the slice_indexer-based helpers note that the basis
# must be sorted).
df = pd.DataFrame({"RollBasis":np.random.uniform(0,100000,10000), "ToRoll": np.random.uniform(0,10,10000)}).sort_values("RollBasis")
In [14]: %timeit rollBy_Ian_iloc(df.ToRoll,df.RollBasis,10,sum)
...: %timeit rollBy_Ian_index(df.ToRoll,df.RollBasis,10,sum)
...: %timeit rollBy_Bren(df.ToRoll,df.RollBasis,10,sum)
...: %timeit df.roll_by('RollBasis', 10, lambda x: x['ToRoll'].sum())
...:
484 ms ± 28.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.58 s ± 10.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
3.12 s ± 22.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.48 s ± 45.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
结论:绑定到 DataFrame 的方法不如 @Ian Sudbury 的方法快,但也不像 @BrenBarn 的方法那么慢;不过,它在可应用于窗口的函数方面确实提供了更大的灵活性。