虽然这不是纯 pandas 的解决方案,但它非常快:下面的示例使用 NCLS 库,在不到 5 秒内找出约 5000 万个重叠。
安装:
# pip install ncls
# or
# conda install -c bioconda ncls
设置:
import numpy as np
import pandas as pd

# Seed the global NumPy RNG so the generated sample data is reproducible.
np.random.seed(0)

size = 10 ** 6    # number of intervals and number of query points
dtype = np.int32  # integer type used for every coordinate array

# NOTE: the order of the three randint draws below determines the data;
# reordering them changes the generated values.

# Interval starts in [0, 1e7); each end is the start plus a length in [0, 1e3).
start = np.random.randint(0, 10 ** 7, size=size, dtype=dtype)
end = start + np.random.randint(0, 10 ** 3, size=size, dtype=dtype)

# Query points, represented as width-1 intervals (end2 is start2 + 1).
start2 = np.random.randint(0, 10 ** 7, size=size, dtype=dtype)
end2 = start2 + 1

intervals = pd.DataFrame(dict(Start=start, End=end))
# Start End
# 0 8325804 8326332
# 1 1484405 1485343
# 2 2215104 2215531
# 3 5157699 5157834
# 4 8222403 8222497
# ... ... ...
# 999995 2981746 2982673
# 999996 1453668 1454251
# 999997 3325111 3325135
# 999998 4311711 4312465
# 999999 8089671 8090277
#
# [1000000 rows x 2 columns]
# Query points use the same Start/End column layout as `intervals`.
points = pd.DataFrame(dict(Start=start2, End=end2))
# Start End
# 0 1714420 1714421
# 1 980607 980608
# 2 5566444 5566445
# 3 2788107 2788108
# 4 6145575 6145576
# ... ... ...
# 999995 1824809 1824810
# 999996 6135851 6135852
# 999997 5190341 5190342
# 999998 7403307 7403308
# 999999 9732498 9732499
#
# [1000000 rows x 2 columns]
执行:
from ncls import NCLS

# Build the NCLS structure from the interval coordinates, passing each row's
# index label as its id so hits can be mapped back to DataFrame rows.
n = NCLS(
    intervals["Start"].to_numpy(),
    intervals["End"].to_numpy(),
    intervals.index.to_numpy(),
)
# Wall time: 421 ms

# Query all points in one call. The two returned arrays are used below as
# row labels into `points` (first) and `intervals` (second), one pair per hit.
p_ix, i_ix = n.all_overlaps_both(
    points["Start"].to_numpy(),
    points["End"].to_numpy(),
    points.index.to_numpy(),
)
# Wall time: 4.4s

len(i_ix) / 1e6  # number of overlaps found, in millions
# 49.895545

# Pull out the matching rows on each side and renumber them 0..k-1 so the
# two frames line up row-for-row when concatenated side by side.
i = intervals.reindex(i_ix).reset_index(drop=True)
p = (
    points.reindex(p_ix)
    .reset_index(drop=True)
    .rename(columns={"Start": "PStart", "End": "PEnd"})
)
result = pd.concat([i, p], axis="columns")
print(result)
# Start End PStart PEnd
# 0 1713535 1714442 1714420 1714421
# 1 1713560 1714479 1714420 1714421
# 2 1713670 1714590 1714420 1714421
# 3 1713677 1714666 1714420 1714421
# 4 1713694 1714627 1714420 1714421
# ... ... ... ... ...
# 49895540 9732449 9732910 9732498 9732499
# 49895541 9732491 9733159 9732498 9732499
# 49895542 9732492 9732621 9732498 9732499
# 49895543 9732496 9732653 9732498 9732499
# 49895544 9732496 9732512 9732498 9732499
#
# [49895545 rows x 4 columns]