下面用 pyranges 来解答——它基本上就是加了生物信息学语法糖的 pandas。
设置:
# Imports first; NumPy's global RNG is then seeded before any intervals are
# drawn (presumably so the random tables below are reproducible -- confirm
# that pr.random draws from NumPy's global state).
import numpy as np
import pyranges as pr

np.random.seed(0)

# a: one million random intervals. In the printout below every row spans
# 100 positions (End - Start == 100) and carries a strand.
a = pr.random(1_000_000)
# +--------------+-----------+-----------+--------------+
# | Chromosome | Start | End | Strand |
# | (category) | (int32) | (int32) | (category) |
# |--------------+-----------+-----------+--------------|
# | chr1 | 8830650 | 8830750 | + |
# | chr1 | 9564361 | 9564461 | + |
# | chr1 | 44977425 | 44977525 | + |
# | chr1 | 239741543 | 239741643 | + |
# | ... | ... | ... | ... |
# | chrY | 29437476 | 29437576 | - |
# | chrY | 49995298 | 49995398 | - |
# | chrY | 50840129 | 50840229 | - |
# | chrY | 38069647 | 38069747 | - |
# +--------------+-----------+-----------+--------------+
# Stranded PyRanges object has 1,000,000 rows and 4 columns from 25 chromosomes.
# For printing, the PyRanges was sorted on Chromosome and Strand.
# b: a second set of one million random intervals, requested with length=1,
# so each row below is a single position (End - Start == 1).
b = pr.random(1_000_000, length=1)
# +--------------+-----------+-----------+--------------+
# | Chromosome | Start | End | Strand |
# | (category) | (int32) | (int32) | (category) |
# |--------------+-----------+-----------+--------------|
# | chr1 | 52110394 | 52110395 | + |
# | chr1 | 122640219 | 122640220 | + |
# | chr1 | 162690565 | 162690566 | + |
# | chr1 | 117198743 | 117198744 | + |
# | ... | ... | ... | ... |
# | chrY | 45169886 | 45169887 | - |
# | chrY | 38863683 | 38863684 | - |
# | chrY | 28592193 | 28592194 | - |
# | chrY | 29441949 | 29441950 | - |
# +--------------+-----------+-----------+--------------+
# Stranded PyRanges object has 1,000,000 rows and 4 columns from 25 chromosomes.
# For printing, the PyRanges was sorted on Chromosome and Strand.
执行:
# Join a against b with strandedness="same". The result keeps a's columns and
# adds b's Start/End/Strand with a "_b" suffix. In every row printed below the
# b position lies inside the paired a interval (Start <= Start_b < End) and
# Strand equals Strand_b; only 16,153 pairs remain out of the two
# million-row inputs.
result = a.join(b, strandedness="same")
# +--------------+-----------+-----------+--------------+-----------+-----------+--------------+
# | Chromosome | Start | End | Strand | Start_b | End_b | Strand_b |
# | (category) | (int32) | (int32) | (category) | (int32) | (int32) | (category) |
# |--------------+-----------+-----------+--------------+-----------+-----------+--------------|
# | chr1 | 227348436 | 227348536 | + | 227348516 | 227348517 | + |
# | chr1 | 18901135 | 18901235 | + | 18901191 | 18901192 | + |
# | chr1 | 230131576 | 230131676 | + | 230131636 | 230131637 | + |
# | chr1 | 84829850 | 84829950 | + | 84829903 | 84829904 | + |
# | ... | ... | ... | ... | ... | ... | ... |
# | chrY | 44139791 | 44139891 | - | 44139821 | 44139822 | - |
# | chrY | 51689785 | 51689885 | - | 51689859 | 51689860 | - |
# | chrY | 45379140 | 45379240 | - | 45379215 | 45379216 | - |
# | chrY | 37469479 | 37469579 | - | 37469576 | 37469577 | - |
# +--------------+-----------+-----------+--------------+-----------+-----------+--------------+
# Stranded PyRanges object has 16,153 rows and 7 columns from 24 chromosomes.
# For printing, the PyRanges was sorted on Chromosome and Strand.
# Pull the joined result out through its .df attribute. The printout below
# shows the same 7 columns as a single flat table with a 0-based integer
# index (16153 rows) -- presumably a pandas DataFrame; confirm against the
# pyranges docs.
df = result.df
# Chromosome Start End Strand Start_b End_b Strand_b
# 0 chr1 227348436 227348536 + 227348516 227348517 +
# 1 chr1 18901135 18901235 + 18901191 18901192 +
# 2 chr1 230131576 230131676 + 230131636 230131637 +
# 3 chr1 84829850 84829950 + 84829903 84829904 +
# 4 chr1 189088140 189088240 + 189088163 189088164 +
# ... ... ... ... ... ... ... ...
# 16148 chrY 38968068 38968168 - 38968124 38968125 -
# 16149 chrY 44139791 44139891 - 44139821 44139822 -
# 16150 chrY 51689785 51689885 - 51689859 51689860 -
# 16151 chrY 45379140 45379240 - 45379215 45379216 -
# 16152 chrY 37469479 37469579 - 37469576 37469577 -
#
# [16153 rows x 7 columns]