对于性能,我建议 NumPy -
def mask_variable_largest_per_row(df, n_max):
a = df.values
m,n = a.shape
nan_row_count = np.isnan(a).sum(1)
n_reset = n-n_max.values-nan_row_count
n_reset.clip(min=0, max=n-1, out = n_reset)
sidx = a.argsort(1)
mask = n_reset[:,None] > np.arange(n)
c = sidx[mask]
r = np.repeat(np.arange(m), n_reset)
a[r,c] = np.nan
return df
样品运行 -
In [182]: df
Out[182]:
a b c d e
row1 1.0 2.0 NaN 4.0 5.0
row2 11.0 12.0 13.0 14.0 NaN
row3 22.0 22.0 5.0 24.0 25.0
In [183]: n_max = pd.Series([2,3,2])
In [184]: mask_variable_largest_per_row(df, n_max)
Out[184]:
a b c d e
row1 NaN NaN NaN 4.0 5.0
row2 NaN 12.0 13.0 14.0 NaN
row3 NaN NaN NaN 24.0 25.0
进一步提升:引入numpy.argpartition
替换numpy.argsort
应该会有所帮助,因为我们不关心要重置为的索引的顺序NaNs
。因此,一个numpy.argpartition
基于的将是 -
def mask_variable_largest_per_row_v2(df, n_max):
a = df.values
m,n = a.shape
nan_row_count = np.isnan(a).sum(1)
n_reset = n-n_max.values-nan_row_count
n_reset.clip(min=0, max=n-1, out = n_reset)
N = (n-n_max.values).max()
N = np.clip(N, a_min=0, a_max=n-1)
sidx = a.argpartition(N, axis=1) #sidx = a.argsort(1)
mask = n_reset[:,None] > np.arange(n)
c = sidx[mask]
r = np.repeat(np.arange(m), n_reset)
a[r,c] = np.nan
return df
运行时测试
其他方法 -
def pandas_rank_based(df, n_max):
n_max.index = df.index
df_rank = df.stack(dropna=False).groupby(level=0).rank\
(ascending=False, method='first').unstack()
selected = df_rank.le(n_max, axis=0)
return df[selected]
验证和时间安排 -
In [387]: arr = np.random.rand(1000,1000)
...: arr.ravel()[np.random.choice(arr.size, 10000, replace=0)] = np.nan
...: df1 = pd.DataFrame(arr)
...: df2 = df1.copy()
...: df3 = df1.copy()
...: n_max = pd.Series(np.random.randint(0,1000,(1000)))
...:
...: out1 = pandas_rank_based(df1, n_max)
...: out2 = mask_variable_largest_per_row(df2, n_max)
...: out3 = mask_variable_largest_per_row_v2(df3, n_max)
...: print np.nansum(out1-out2)==0 # Verify
...: print np.nansum(out1-out3)==0 # Verify
...:
True
True
In [388]: arr = np.random.rand(1000,1000)
...: arr.ravel()[np.random.choice(arr.size, 10000, replace=0)] = np.nan
...: df1 = pd.DataFrame(arr)
...: df2 = df1.copy()
...: df3 = df1.copy()
...: n_max = pd.Series(np.random.randint(0,1000,(1000)))
...:
In [389]: %timeit pandas_rank_based(df1, n_max)
1 loops, best of 3: 559 ms per loop
In [390]: %timeit mask_variable_largest_per_row(df2, n_max)
10 loops, best of 3: 34.1 ms per loop
In [391]: %timeit mask_variable_largest_per_row_v2(df3, n_max)
100 loops, best of 3: 5.92 ms per loop
50x+
与内置的 pandas相比,那里的加速非常好!