这是使用 NumPy 的另一种方式。它更快,因为它在整个底层数组上使用 NumPy 函数,而不是单独将 Python 函数应用于每一行:
import io
import pandas as pd
import numpy as np
# Sample data: six whitespace-separated monthly columns, with NaN marking
# the missing readings.
content = '''\
Jan Feb Mar Apr May June
0.349143 0.249041 0.244352 NaN 0.425336 NaN
0.530616 0.816829 NaN 0.212282 0.099364 NaN
0.713001 0.073601 0.242077 0.553908 NaN NaN
0.245295 0.007016 0.444352 0.515705 0.497119 NaN
0.195662 0.007249 NaN 0.852287 NaN NaN'''
# `content` is text, so it must be wrapped in StringIO (BytesIO rejects str on
# Python 3).  The separator is a regex and is written as a raw string so that
# `\s` is not treated as a (invalid) string escape.
df = pd.read_table(io.StringIO(content), sep=r'\s+')
def remove_rows_with_holes(df):
    """Return the rows of ``df`` in which no value follows a NaN.

    A row is kept when every non-NaN entry comes before every NaN entry,
    i.e. the NaNs (if any) are all trailing.  Rows without any NaN and rows
    consisting only of NaNs contain no hole and are kept as well.

    The whole computation is done on the underlying 2-D boolean array, so no
    Python-level function is called per row.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The subset of rows of ``df`` that have no hole, in original order.
    """
    # isnull() (rather than np.isnan) also works for non-float columns.
    nans = df.isnull().values
    h, w = nans.shape
    if w == 0:
        # argmax/argmin raise on a zero-length axis; a row without columns
        # cannot contain a hole, so every row is kept.
        return df.copy()

    # First index (per row) which is a NaN.  argmax returns 0 when a row
    # holds no True at all, so rows without NaN get the sentinel `w`
    # ("the first NaN lies past the end").
    # e.g. [F F F T F T] -> 3,  [F F F F F F] -> w
    nan_index = np.where(nans.any(axis=1), np.argmax(nans, axis=1), w)

    # One past the last index (per row) which is not a NaN, found by
    # scanning the row from the right.  argmin returns 0 when a row is all
    # True, so all-NaN rows get the sentinel 0 ("no valid value at all").
    # e.g. [F F F T F T] -> 5,  [T T T T T T] -> 0
    not_nan_index = np.where(
        nans.all(axis=1), 0, w - np.argmin(np.fliplr(nans), axis=1))

    # Keep a row only if its first NaN comes at or after the end of its
    # valid values.
    mask = nan_index >= not_nan_index
    return df[mask]
def holey(s):
    """Return True if the Series ``s`` has a NaN between two valid values.

    Leading NaNs (before the first valid value) and trailing NaNs (after the
    last valid value) do not count as holes.  An empty or all-NaN Series is
    not holey.

    Parameters
    ----------
    s : pandas.Series

    Returns
    -------
    bool
    """
    # Work on a plain boolean array so that every index below is positional,
    # whatever labels the Series carries.
    valid = s.notnull().values
    if not valid.any():
        return False
    # Drop the leading NaNs: `tail` starts at the first valid value.
    starts_at = valid.argmax()
    tail = valid[starts_at:]
    # Position, within `tail`, of the first NaN after the first valid value.
    # tail[0] is valid by construction, so 0 means "no NaN found".
    next_null = (~tail).argmax()
    if next_null == 0:
        return False
    # Slice `tail` (not `s`): next_null is relative to starts_at.
    any_values_left = bool(tail[next_null:].any())
    return any_values_left
def remove_using_holey(df):
    """Return the rows of ``df`` for which ``holey`` is false.

    ``holey`` is called once per row through ``DataFrame.apply`` with
    ``axis=1``; the rows it flags are then filtered out.
    """
    has_hole = df.apply(holey, axis=1)
    keep = ~has_hole
    return df[keep]
以下是 timeit 结果:
In [78]: %timeit remove_using_holey(df)
1000 loops, best of 3: 1.53 ms per loop
In [79]: %timeit remove_rows_with_holes(df)
10000 loops, best of 3: 85.6 us per loop
随着 DataFrame 中行数的增加,差异变得更加显著:
In [85]: df = pd.concat([df]*100)
In [86]: %timeit remove_using_holey(df)
1 loops, best of 3: 1.29 s per loop
In [87]: %timeit remove_rows_with_holes(df)
1000 loops, best of 3: 440 us per loop
In [88]: 1.29 * 10**6 / 440
Out[88]: 2931.818181818182