这个问题是在 C++ 中搜索逻辑移位的第一个结果。
因此,回答允许强制转换的一般情况也是有意义的- 因为此处显示的代码均未使用正确(且快速)的操作码(仅一条shr
指令而不是sar
)。
演员表
此版本也适用于即将推出的int128_t
(目前__int128
在 GCC、Clang 和 ICC 中)和其他未来类型。如果您被允许使用type_traits
并且您的代码应该在将来工作而不考虑正确的无符号类型,那么您应该将它用于正确的强制转换。
代码
#include <type_traits>
template<class T>
inline T logicalShift(T t1, T t2) {
return
static_cast<
typename std::make_unsigned<T>::type
>(t1) >> t2;
}
汇编代码分析
将其打包成f(T x, T y) { return logicalShift(x, y); }
以下汇编程序指令(GCC9.2,-O3)的结果:
f(int, int):
mov eax, edi
mov ecx, esi
shr eax, cl
ret
f(unsigned int, unsigned int):
mov eax, edi
mov ecx, esi
shr eax, cl
ret
f(__int128, __int128):
mov rcx, rdx
mov rax, rdi
mov rdx, rsi
shrd rax, rsi, cl
shr rdx, cl
xor esi, esi
and ecx, 64
cmovne rax, rdx
cmovne rdx, rsi
ret
f(unsigned __int128, unsigned __int128):
mov rcx, rdx
mov rax, rdi
mov rdx, rsi
shrd rax, rsi, cl
shr rdx, cl
xor esi, esi
and ecx, 64
cmovne rax, rdx
cmovne rdx, rsi
ret
结果T a, b; T c = a >> b;
:
f(int, int):
mov eax, edi # 32-bit registers
mov ecx, esi
sar eax, cl # lower 8-bit of cx register
ret
f(long, long):
mov rax, rdi # 64-bit registers
mov ecx, esi
sar rax, cl
ret
f(unsigned int, unsigned int):
mov eax, edi
mov ecx, esi
shr eax, cl
ret
f(__int128, __int128):
mov rcx, rdx
mov rdx, rsi
mov rax, rdi
sar rdx, cl
shrd rax, rsi, cl
mov rsi, rdx
sar rsi, 63
and ecx, 64
cmovne rax, rdx
cmovne rdx, rsi
ret
我们看到,区别主要只是shr
代替sar
(对于__int128还有一些)。什么代码可以更快?
无演员版本
(将指令集简化为 ~ & ^ | + << >>)
问题 - 左移 ( SAL
, SHL
)
@Fingolfin 最初的想法很好。但是我们的处理器不会做我们首先想到的事情int mask = ~0 << n
,n >= 32;
但是为什么呢?
C++ 标准(草案 N4713, 8.5.7, 2nd)说 <<:
的值E1 << E2
是E1
左移的E2
位位置;空出的位是零填充的。如果E1
具有无符号类型,则结果的值为 ,以E1 × 2^E2
比结果类型中可表示的最大值大一为模减少。否则,如果E1
具有带符号类型和非负值,并且E1 × 2^E2
可以在结果类型的相应无符号类型中表示,则转换为结果类型的该值就是结果值;否则,行为是 undefined。
听起来像(E1 << E2) % (UINTxx_MAX + 1)
,我们只是从右边填充 0 并用模运算切断前导位。简单明了。
分析左移
16 位短、32 位整数和 64 位长的汇编代码(GCC 9.2,-O3)是:
g(short, short):
movsx eax, di # 16-bit to 32-bit register
mov ecx, esi
sal eax, cl # 1st is 32-bit, 2nd is 8-bit register
ret
g(int, int):
mov eax, edi # 32-bit registers
mov ecx, esi
sal eax, cl # 1st is 32-bit, 2nd is 8-bit register
ret
g(long, long):
mov rax, rdi # 64-bit registers
mov ecx, esi
sal rax, cl # 1st is 64-bit, 2nd is 8-bit register
ret
所以,我们讨论了我们从~0 << i
for断言的int i = 0; i <= 33; i++
内容,但我们真正得到了什么?
0: 11111111111111111111111111111111
1: 11111111111111111111111111111110
2: 11111111111111111111111111111100
3: 11111111111111111111111111111000
4: 11111111111111111111111111110000
5: 11111111111111111111111111100000
6: 11111111111111111111111111000000
7: 11111111111111111111111110000000
8: 11111111111111111111111100000000
9: 11111111111111111111111000000000
10: 11111111111111111111110000000000
11: 11111111111111111111100000000000
12: 11111111111111111111000000000000
13: 11111111111111111110000000000000
14: 11111111111111111100000000000000
15: 11111111111111111000000000000000
16: 11111111111111110000000000000000
17: 11111111111111100000000000000000
18: 11111111111111000000000000000000
19: 11111111111110000000000000000000
20: 11111111111100000000000000000000
21: 11111111111000000000000000000000
22: 11111111110000000000000000000000
23: 11111111100000000000000000000000
24: 11111111000000000000000000000000
25: 11111110000000000000000000000000
26: 11111100000000000000000000000000
27: 11111000000000000000000000000000
28: 11110000000000000000000000000000
29: 11100000000000000000000000000000
30: 11000000000000000000000000000000
31: 10000000000000000000000000000000
32: 11111111111111111111111111111111
33: 11111111111111111111111111111110
我们看到,结果更像~0 << (i%2^5)
.
所以,看一下 long(long aka.int64_t)案例:(用 MSVC for x86 编译)
0: 1111111111111111111111111111111111111111111111111111111111111111
1: 1111111111111111111111111111111111111111111111111111111111111110
2: 1111111111111111111111111111111111111111111111111111111111111100
3: 1111111111111111111111111111111111111111111111111111111111111000
4: 1111111111111111111111111111111111111111111111111111111111110000
5: 1111111111111111111111111111111111111111111111111111111111100000
6: 1111111111111111111111111111111111111111111111111111111111000000
7: 1111111111111111111111111111111111111111111111111111111110000000
8: 1111111111111111111111111111111111111111111111111111111100000000
9: 1111111111111111111111111111111111111111111111111111111000000000
10: 1111111111111111111111111111111111111111111111111111110000000000
11: 1111111111111111111111111111111111111111111111111111100000000000
12: 1111111111111111111111111111111111111111111111111111000000000000
13: 1111111111111111111111111111111111111111111111111110000000000000
14: 1111111111111111111111111111111111111111111111111100000000000000
15: 1111111111111111111111111111111111111111111111111000000000000000
16: 1111111111111111111111111111111111111111111111110000000000000000
17: 1111111111111111111111111111111111111111111111100000000000000000
18: 1111111111111111111111111111111111111111111111000000000000000000
19: 1111111111111111111111111111111111111111111110000000000000000000
20: 1111111111111111111111111111111111111111111100000000000000000000
21: 1111111111111111111111111111111111111111111000000000000000000000
22: 1111111111111111111111111111111111111111110000000000000000000000
23: 1111111111111111111111111111111111111111100000000000000000000000
24: 1111111111111111111111111111111111111111000000000000000000000000
25: 1111111111111111111111111111111111111110000000000000000000000000
26: 1111111111111111111111111111111111111100000000000000000000000000
27: 1111111111111111111111111111111111111000000000000000000000000000
28: 1111111111111111111111111111111111110000000000000000000000000000
29: 1111111111111111111111111111111111100000000000000000000000000000
30: 1111111111111111111111111111111111000000000000000000000000000000
31: 1111111111111111111111111111111110000000000000000000000000000000
32: 1111111111111111111111111111111100000000000000000000000000000000
33: 1111111111111111111111111111111000000000000000000000000000000000
34: 1111111111111111111111111111110000000000000000000000000000000000
35: 1111111111111111111111111111100000000000000000000000000000000000
36: 1111111111111111111111111111000000000000000000000000000000000000
37: 1111111111111111111111111110000000000000000000000000000000000000
38: 1111111111111111111111111100000000000000000000000000000000000000
39: 1111111111111111111111111000000000000000000000000000000000000000
40: 1111111111111111111111110000000000000000000000000000000000000000
41: 1111111111111111111111100000000000000000000000000000000000000000
42: 1111111111111111111111000000000000000000000000000000000000000000
43: 1111111111111111111110000000000000000000000000000000000000000000
44: 1111111111111111111100000000000000000000000000000000000000000000
45: 1111111111111111111000000000000000000000000000000000000000000000
46: 1111111111111111110000000000000000000000000000000000000000000000
47: 1111111111111111100000000000000000000000000000000000000000000000
48: 1111111111111111000000000000000000000000000000000000000000000000
49: 1111111111111110000000000000000000000000000000000000000000000000
50: 1111111111111100000000000000000000000000000000000000000000000000
51: 1111111111111000000000000000000000000000000000000000000000000000
52: 1111111111110000000000000000000000000000000000000000000000000000
53: 1111111111100000000000000000000000000000000000000000000000000000
54: 1111111111000000000000000000000000000000000000000000000000000000
55: 1111111110000000000000000000000000000000000000000000000000000000
56: 1111111100000000000000000000000000000000000000000000000000000000
57: 1111111000000000000000000000000000000000000000000000000000000000
58: 1111110000000000000000000000000000000000000000000000000000000000
59: 1111100000000000000000000000000000000000000000000000000000000000
60: 1111000000000000000000000000000000000000000000000000000000000000
61: 1110000000000000000000000000000000000000000000000000000000000000
62: 1100000000000000000000000000000000000000000000000000000000000000
63: 1000000000000000000000000000000000000000000000000000000000000000
64: 0000000000000000000000000000000000000000000000000000000000000000
65: 0000000000000000000000000000000000000000000000000000000000000000
繁荣!
(直到 31 在 GCC 中也保持简短,因为它使用 32 位EAX
寄存器sal
)
但是,此结果仅由编译器创建:
x86 msvc v19.22 , /O2:
_x$ = 8 ; size = 8
_y$ = 16 ; size = 8
__int64 g(__int64,__int64) PROC ; g, COMDAT
mov eax, DWORD PTR _x$[esp-4]
mov edx, DWORD PTR _x$[esp]
mov ecx, DWORD PTR _y$[esp-4]
jmp __allshl
__int64 g(__int64,__int64) ENDP
x64 msvc v19.22 , /O2:
x$ = 8
y$ = 16
__int64 g(__int64,__int64) PROC ; g, COMDAT
mov rax, rcx
mov rcx, rdx
shl rax, cl
ret 0
__int64 g(__int64,__int64) ENDP
并且 x64 MSVC 代码显示与 GCC 9.2 代码相同的行为 - 使用shl
而不是sal
.
从那时起,我们现在知道处理器本身(英特尔酷睿第六代)仅使用cl
寄存器的最后一位,这取决于移位操作的第一个寄存器的长度,即使 C++ 标准另有规定。
一个小修复
所以,这就是代码中断的地方。本能地我会采取 shiftAmount of32 - n
并进入上层问题,您已经通过使用 a shiftAmount
of避免了这种情况31 - n
。知道 n 是 0..31,没有 shiftAmount 32。很好。
但是,一方面减少意味着另一方面增加。现在mask
需要从-2
(我们不换档0b1111
,我们换档0b1110
)开始:
int logSh3(int x, int n) {
int mask = ~2 + 1;
int shiftAmount = 31 + ((~n) + 1);//this evaluates to 31 - n on two's complement machines
mask = mask << shiftAmount;
mask = ~mask;//If n equals 0, it means we have negated all bits and hence have mask = 0
x = x >> n;
return x & mask;
}
它有效!
备用代码和分析
芬国昐密码(固定)
上层代码,作为汇编程序(GCC 9.2,-O3):
logSh3(int, int):
mov ecx, 31
mov edx, -2
mov eax, edi
sub ecx, esi
sal edx, cl
mov ecx, esi
not edx
sar eax, cl
and eax, edx
ret
9 条指令
哈罗德密码
int logSh2(int x, int n) {
int shiftAmount = 31 + ((~n) + 1);//this evaluates to 31 - n on two's complement machines
int mask = 1 << shiftAmount;
mask |= mask + ((~1) + 1);
x = x >> n;
return x & mask;
}
logSh2(int, int):
mov ecx, esi
mov r8d, edi
mov edi, -2147483648
shr edi, cl
sar r8d, cl
lea eax, [rdi-1]
or eax, edi
and eax, r8d
ret
8条指令
我们能做得更好吗?
另一种解决方案
代替左移,我们可以对 进行右移0b1000
,将其向后移一并反转。
int logSh4(int x, int n) {
int mask = 0x80000000;
mask = mask >> n;
mask = mask << 1;
mask = ~mask;//If n equals 0, it means we have negated all bits and hence have mask = 0
x = x >> n;
return x & mask;
}
logSh4(int, int):
mov ecx, esi
mov edx, -2147483648
sar edx, cl
sar edi, cl
lea eax, [rdx+rdx]
not eax
and eax, edi
ret
7 指令
更好的方法?
让我们0b0111
向右移动,向左移动一个并加 1。所以我们不用逆:
int logSh5(int x, int n) {
int mask = 0x7fffffff;
mask = mask >> n;
mask = (mask << 1) | 1;
x = x >> n;
return x & mask;
}
logSh5(int, int):
mov ecx, esi
mov eax, 2147483647
sar eax, cl
sar edi, cl
lea eax, [rax+1+rax]
and eax, edi
ret
还剩 6 条指令。美好的。(但简单的演员阵容仍然是实践中的最佳解决方案)