I am working with some code I found online, and I want to try different branch-prediction scenarios to understand the branch predictor better.
The CPU is an AMD Ryzen 3600.
Basically, what I am doing is measuring the misprediction rate of a given function/code segment. As pseudo/shortened code, this is what I do:
int measurement(int length, int* arr){
    r1 = papi_read();
    for(int i = 0; i < length; i++){
        if(arr[i]){
            do_sth();
        }
    }
    r2 = papi_read();
    return r2 - r1;
}
void warmup(){
    for(volatile int i = 0; i < 10000; i++){
        for(volatile int j = 0; j < 100; j++){} // important line
    }
}
int main() {
    init_papi();
    init_others(); // creates arrays, initializes them, etc.
    warmup();
    for(int i = 0; i < 20; i++){
        results[i] = measurement(128, array);
        usleep(1200); // 2nd important line
    }
    print_mispredictions();
}
My setup
- I have isolated the core I am working on, so no other user process runs on it. I have also isolated its sibling core, so I am fully in charge of both cores, apart from interrupts and kernel routines.
- Previously I have seen that if I sleep between iterations (as in the main function), the CPU drops into a deeper C-state and the branch prediction unit (the BHT in this case) gets reset. That is the explanation for the "2nd important line" in the code.
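As a side note on the C-state explanation: one way to check it, instead of relying on sleep behavior, is to temporarily forbid deep C-states through the PM QoS interface (/dev/cpu_dma_latency). This is only a sketch of that idea, not something the measurements below depend on; the returned fd has to stay open for as long as the constraint should hold, and opening the file usually requires root.
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

// Returns an fd that must stay open while the constraint should hold;
// closing it restores the default C-state policy.
static int forbid_deep_cstates(void) {
    int fd = open("/dev/cpu_dma_latency", O_WRONLY);
    if (fd < 0) {
        perror("open /dev/cpu_dma_latency");
        return -1;
    }
    int32_t max_exit_latency_us = 0; // 0 = keep the core in the shallowest C-states
    if (write(fd, &max_exit_latency_us, sizeof(max_exit_latency_us))
            != sizeof(max_exit_latency_us)) {
        perror("write cpu_dma_latency");
        close(fd);
        return -1;
    }
    return fd;
}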
What I want to see
Without the sleep line, I expect to see fewer and fewer mispredictions on every iteration, because the branch predictor is learning the pattern in the array.
With the sleep line, what I want to achieve is that every iteration shows a similar number of mispredictions, because the BPU entries are being reset.
What is the problem
The problem is that when I change the warmup function from
void warmup(){
    for(volatile int i = 0; i < 10000; i++){
        for(volatile int j = 0; j < 100; j++){}
    }
}
to
void warmup(){
    for(volatile int i = 0; i < 1000000; i++){ // notice that this is the same
                                               // total amount of iterations
    }
}
the measurements get messed up. I ran into this kind of problem in a previous question as well, but it was never answered: changing a line that is unrelated to the measurement changes the measurement behavior.
Here are my results:
# With 1 for loop, expected behavior. At every iteration, it is reset to ~ 60
# For 128 if statements, ~60 mispredictions means ~50% guessing.
$ ./exp
0:73 #iteration count, misprediction count
1:62
2:63
3:21
4:63
...
# With 2 for loops. Unexpected behavior. It should always reset to ~ 60 but it keeps decreasing.
$ ./exp
0:66
1:18
2:4
3:4
4:1
5:0
6:0
...
Placing an mfence or lfence instruction after the warmup does not change the results either.
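If anyone wants to reproduce the fence variant: one way to place the fence is plain inline assembly between warmup() and the measurement loop, roughly like the sketch below (the helper name is just for illustration).
// Placed between warmup() and the measurement loop.
static inline void fence_after_warmup(void) {
    __asm__ __volatile__("lfence" ::: "memory"); // or "mfence"
}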
Below is the whole code, in case anyone wants to try it and/or has an answer for this behavior.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/sysinfo.h>
#include <time.h>
#include "papi.h"
#define stick_this_thread_to_core(retval, core_id){ \
    int num_cores = sysconf(_SC_NPROCESSORS_ONLN); \
    if (core_id < 0 || core_id >= num_cores) \
        retval = EINVAL; \
    cpu_set_t cpuset; \
    CPU_ZERO(&cpuset); \
    CPU_SET(core_id, &cpuset); \
    retval = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);\
}
#define ERROR_RETURN(retval) { \
    fprintf(stderr, "Error %d, (%s) %s:line %d: \n", retval, PAPI_strerror(retval), __FILE__, __LINE__); \
    exit(retval); \
}
void papi_my_native_add_event(int* EventSet1, char* eventname, int *native){
    int retval;
    //printf("native add\n");
    if((retval = PAPI_event_name_to_code(eventname, native)) != PAPI_OK)
        ERROR_RETURN(retval);
    //printf("native add to_code is successful\n");
    if ((retval = PAPI_add_event(*EventSet1, *native)) != PAPI_OK)
        ERROR_RETURN(retval);
    //printf("native add add_event is successful\n");
    int number = 0;
    if((retval = PAPI_list_events(*EventSet1, NULL, &number)) != PAPI_OK)
        ERROR_RETURN(retval);
    //fprintf(stderr, "Added %d events.\n", number);
}
void papi_my_native_add_start_event(int* EventSet1, char* eventname, int *native){
    papi_my_native_add_event(EventSet1, eventname, native);
    int retval = 0;
    if((retval = PAPI_start(*EventSet1)) != PAPI_OK)
        ERROR_RETURN(retval);
    //printf("START %s\n", eventname);
}
int RNG_SIZE = 128;
uint64_t* rng_arr;
uint64_t dummy;
// 12th core
int cpuid = 11;
// Code from
// https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/tree/master/2019/11/05
// acts like a random number generator, but it is deterministic.
static inline uint64_t rng(uint64_t h) {
    h ^= h >> 33;
    h *= UINT64_C(0xff51afd7ed558ccd);
    h ^= h >> 33;
    h *= UINT64_C(0xc4ceb9fe1a85ec53);
    h ^= h >> 33;
    return h;
}
// Counts retired mispredicted branches across the data-dependent branch below.
uint64_t measurement(int* EventSet, uint64_t howmany, uint64_t* arr){
    long long reads[2] = {0};
    PAPI_read(*EventSet, &reads[0]);
    for(uint64_t j = 0; j < howmany; j++){
        if(arr[j]){
            dummy &= arr[j];
        }
    }
    PAPI_read(*EventSet, &reads[1]);
    return (reads[1] - reads[0]);
}
// Fills rng_arr with a deterministic pseudo-random 0/1 pattern.
void precompute_rng(){
    int howmany = RNG_SIZE;
    for(int i = 0; i < RNG_SIZE; i++){
        rng_arr[i] = rng(howmany) & 0x1;
        howmany--;
    }
}
int stick_to_core(){
    int retval = 0;
    stick_this_thread_to_core(retval, cpuid);
    if(retval){
        printf("Affinity error: %s\n", strerror(errno));
        return 1;
    }
    return 0;
}
void init_papi(int* EventSet, int cpuid){
    int retval = 0;
    // papi init
    if((retval = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT)
        ERROR_RETURN(retval);
    PAPI_option_t opts1;
    opts1.cpu.cpu_num = cpuid;
    if((retval = PAPI_create_eventset(EventSet)) != PAPI_OK)
        ERROR_RETURN(retval);
    if((retval = PAPI_assign_eventset_component(*EventSet, 0)) != PAPI_OK)
        ERROR_RETURN(retval);
    opts1.cpu.eventset = *EventSet;
    if((retval = PAPI_set_opt(PAPI_CPU_ATTACH, &opts1)) != PAPI_OK)
        ERROR_RETURN(retval);
    char* eventname = "RETIRED_BRANCH_INSTRUCTIONS_MISPREDICTED";
    int native = 0x0;
    papi_my_native_add_start_event(EventSet, eventname, &native);
}
void warmup(){
    for(volatile int i = 0; i < 100000; i++){
        for(volatile int j = 0; j < 100; j++){} // important line
    }
}
int main() {
    if(stick_to_core()){
        printf("Error on sticking to the core\n");
        return 1;
    }
    int EventSet = PAPI_NULL;
    int* EventSetPtr = &EventSet;
    init_papi(EventSetPtr, cpuid);
    rng_arr = (uint64_t*) malloc(RNG_SIZE * sizeof(uint64_t));
    precompute_rng();
    int iter = 4096;
    uint64_t* results = (uint64_t*) malloc(iter * sizeof(uint64_t));
    for(int i = 0; i < iter; i++)
        results[i] = 0;
    warmup();
    for(int i = 0; i < 20; i++){
        results[i] = measurement(&EventSet, RNG_SIZE, rng_arr);
        usleep(1200);
    }
    // prints
    for(int i = 0; i < 20; i++){
        printf("%d:%lu\n", i, results[i]);
    }
    printf("\n");
    free(results);
    free(rng_arr);
    return 0;
}
Compilation
gcc -O0 main.c -lpthread -lpapi -o exp
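In case anyone wants to cross-check the counter without PAPI, below is a minimal sketch that reads branch mispredictions through perf_event_open. It uses the generic PERF_COUNT_HW_BRANCH_MISSES event; I am assuming this maps to the same retired-mispredicted-branches counter on Zen 2.
#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

// Opens a per-thread counter for branch mispredictions (user space only).
static int open_branch_miss_counter(void) {
    struct perf_event_attr attr;
    memset(&attr, 0, sizeof(attr));
    attr.type = PERF_TYPE_HARDWARE;
    attr.size = sizeof(attr);
    attr.config = PERF_COUNT_HW_BRANCH_MISSES;
    attr.disabled = 1;
    attr.exclude_kernel = 1;
    attr.exclude_hv = 1;
    // pid = 0 (this thread), cpu = -1 (any CPU), no group leader, no flags
    return (int) syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
}

// Usage around the measured region:
//   int fd = open_branch_miss_counter();
//   ioctl(fd, PERF_EVENT_IOC_RESET, 0);
//   ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
//   /* ... code under test ... */
//   ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
//   long long misses = 0;
//   read(fd, &misses, sizeof(misses));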