我正在查看zstd存储库中fse实现中使用的规范化算法,但我不理解某些部分:
static U32 const rtbTable[] = { 0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 };
我尝试使用各种输入运行该代码,我得到了一个很好的结果,因为归一化输出总和也必须等于2^tableLog
. 它似乎是一个阈值表,可以为所有人制定proba -= 1
或proba +=1
为所有人制定proba < 8
。此外,这个vStep
变量对我来说也很神秘=-)
这是完整的功能:
size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
const unsigned* count, size_t total,
unsigned maxSymbolValue, unsigned useLowProbCount)
{
/* Sanity checks */
if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC); /* Unsupported size */
if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); /* Unsupported size */
if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return ERROR(GENERIC); /* Too small tableLog, compression potentially impossible */
{ static U32 const rtbTable[] = { 0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 };
short const lowProbCount = useLowProbCount ? -1 : 1;
U64 const scale = 62 - tableLog;
U64 const step = ZSTD_div64((U64)1<<62, (U32)total); /* <== here, one division ! */
U64 const vStep = 1ULL<<(scale-20);
int stillToDistribute = 1<<tableLog;
unsigned s;
unsigned largest=0;
short largestP=0;
U32 lowThreshold = (U32)(total >> tableLog);
for (s=0; s<=maxSymbolValue; s++) {
if (count[s] == total) return 0; /* rle special case */
if (count[s] == 0) { normalizedCounter[s]=0; continue; }
if (count[s] <= lowThreshold) {
normalizedCounter[s] = lowProbCount;
stillToDistribute--;
} else {
short proba = (short)((count[s]*step) >> scale);
if (proba<8) {
U64 restToBeat = vStep * rtbTable[proba];
proba += (count[s]*step) - ((U64)proba<<scale) > restToBeat;
}
if (proba > largestP) { largestP=proba; largest=s; }
normalizedCounter[s] = proba;
stillToDistribute -= proba;
} }
if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
/* corner case, need another normalization method */
size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue, lowProbCount);
if (FSE_isError(errorCode)) return errorCode;
}
else normalizedCounter[largest] += (short)stillToDistribute;
}
}
如果你想要文件的链接:你可以看这里
有人知道这些变量的目的是什么吗?它们是否与已知的归一化算法有关?