我目前使用的是下面这种方法:它将四个 popcnt(位计数)运算交错在一起执行。该方法基于这个 C 实现。
/** Mask selecting the low bit of every 2-bit pair (binary 0101...01). */
private static final long M0 = 0x5555555555555555L;

/** Mask selecting the low two bits of every 4-bit nibble (binary 0011...0011). */
private static final long M1 = 0x3333333333333333L;

/** Mask selecting the low nibble of every byte (binary 00001111...00001111). */
private static final long M2 = 0x0f0f0f0f0f0f0f0fL;
/**
 * Computes the population count (number of set bits) of four 64-bit tags and
 * hands each tag, together with its count, to {@code storeWithPopCnt}.
 *
 * <p>The four bit counts are computed with the classic SWAR reduction and are
 * advanced in lock step, one stage at a time, so the four independent
 * computations are interleaved rather than run back to back.
 *
 * <p>The count passed on is in the range 0..64 inclusive: a tag with all 64
 * bits set yields 64, which needs seven bits to represent.
 *
 * @param tag0 first tag to count and store
 * @param tag1 second tag to count and store
 * @param tag2 third tag to count and store
 * @param tag3 fourth tag to count and store
 */
public void store4Tags(long tag0, long tag1, long tag2, long tag3) {
    long count0 = tag0,
         count1 = tag1,
         count2 = tag2,
         count3 = tag3;

    // Stage 1: every 2-bit field now holds the number of set bits of its pair (0..2).
    count0 = (count0 & M0) + ((count0 >>> 1) & M0);
    count1 = (count1 & M0) + ((count1 >>> 1) & M0);
    count2 = (count2 & M0) + ((count2 >>> 1) & M0);
    count3 = (count3 & M0) + ((count3 >>> 1) & M0);

    // Stage 2: every 4-bit field now holds the count of its nibble (0..4).
    count0 = (count0 & M1) + ((count0 >>> 2) & M1);
    count1 = (count1 & M1) + ((count1 >>> 2) & M1);
    count2 = (count2 & M1) + ((count2 >>> 2) & M1);
    count3 = (count3 & M1) + ((count3 >>> 2) & M1);

    // Stage 3: every byte now holds the count of its 8 bits (0..8). The sum of
    // two nibble counts fits in 4 bits, so masking after the add is sufficient.
    count0 = (count0 + (count0 >>> 4)) & M2;
    count1 = (count1 + (count1 >>> 4)) & M2;
    count2 = (count2 + (count2 >>> 4)) & M2;
    count3 = (count3 + (count3 >>> 4)) & M2;

    // Stages 4-6: fold the eight byte counts into the lowest byte. The total is
    // at most 64, so the low byte never overflows; the higher bytes end up
    // holding partial sums and are discarded by the final mask.
    count0 += count0 >>> 8;
    count1 += count1 >>> 8;
    count2 += count2 >>> 8;
    count3 += count3 >>> 8;

    count0 += count0 >>> 16;
    count1 += count1 >>> 16;
    count2 += count2 >>> 16;
    count3 += count3 >>> 16;

    count0 += count0 >>> 32;
    count1 += count1 >>> 32;
    count2 += count2 >>> 32;
    count3 += count3 >>> 32;

    // Keep seven bits, not six: a count of 64 (all bits set) is 0b1000000 and
    // would be truncated to 0 by a 0x3f mask.
    // NOTE(review): storeWithPopCnt must therefore accept counts 0..64 - confirm.
    storeWithPopCnt(tag0, 0x7f & (int) count0);
    storeWithPopCnt(tag1, 0x7f & (int) count1);
    storeWithPopCnt(tag2, 0x7f & (int) count2);
    storeWithPopCnt(tag3, 0x7f & (int) count3);
}
这种方法的性能略优于查找表版本,而且不会占用缓存。