只是在这里大声思考,但让我们看看这在算法上是否有意义:
对于每一页,存储三个字段:
当用户点击页面时,为该用户生成哈希。如果该散列已经在布隆过滤器中,只需碰撞视图计数器
如果它不在布隆过滤器中,则碰撞两个计数器并将该哈希添加到过滤器中。但是,根据您的哈希值,布隆过滤器成员资格检查中可能会出现误报(但绝不会出现误报),因此请注意如何选择哈希算法。
三个领域。还不错。:)
参考:维基百科上的布隆过滤器
编辑:我已经让这段代码漂浮了一段时间 - 不确定核心最初来自哪里,但多年来我已经对其进行了调整 - 为 LINQPad 做好了准备:
void Main()
{
var estimatedCount = 100000;
var falsePositiveProbability = 0.001;
var falsePositiveCount = 0;
var memberCount = 0;
var bloom = BloomFilter<char>.Create(
estimatedCount,
falsePositiveProbability,
c => c.GetHashCode(),
c => (int)c);
var allChars = Enumerable.Range(0, 0xffff).Select(i => (char)i).ToList();
foreach(var c in allChars)
{
var alreadyIn = bloom.Test(c);
if(alreadyIn)
{
falsePositiveCount++;
}
bloom.Add(c);
memberCount++;
}
Console.WriteLine("Predicted count: {0} Predicted false positive: {1:p} ", estimatedCount, falsePositiveProbability);
Console.WriteLine("Actual false positive count: {0} Actual member count: {1} ", falsePositiveCount, memberCount);
Console.WriteLine("False positive rate: {0:p}", ((double)falsePositiveCount / memberCount));
}
// Define other methods and classes here
public class BloomFilter<TValue>
{
private BitArray hashbits;
private int numKeys;
private Func<TValue,int> _hashFunc1;
private Func<TValue,int> _hashFunc2;
public static BloomFilter<TValue> Create(int estimateCount, double falsePositiveRate, Func<TValue,int> hash1, Func<TValue,int> hash2)
{
// formulae courtesy of http://hur.st/bloomfilter
var tableSize = Math.Ceiling((estimateCount * Math.Log(falsePositiveRate)) / Math.Log(1.0 / (Math.Pow(2.0, Math.Log(2.0)))));
var keyCount = Math.Round(Math.Log(2.0) * tableSize / estimateCount);
return new BloomFilter<TValue>((int)tableSize, (int)keyCount)
{
_hashFunc1 = hash1,
_hashFunc2 = hash2
};
}
private BloomFilter(int tableSize, int nKeys)
{
numKeys = nKeys;
hashbits = new BitArray(tableSize);
}
public bool Test(TValue val)
{
var hashKeys = GenerateHashes(val);
foreach (int hash in hashKeys)
{
if (!hashbits[hash])
return false;
}
return true;
}
public bool Add(TValue val)
{
bool rslt = true;
var hashKeys = GenerateHashes(val);
foreach (int hash in hashKeys)
{
if (!hashbits[hash])
{
rslt = false;
hashbits[hash] = true;
}
}
return rslt;
}
private int[] GenerateHashes(TValue val)
{
int hash1 = _hashFunc1(val);
int hash2 = _hashFunc2(val);
var hashKeys = new int[numKeys];
hashKeys[0] = Math.Abs(hash1 % hashbits.Count);
if (numKeys > 1)
{
for (int i = 1; i < numKeys; i++)
{
hashKeys[i] = Math.Abs((hash1 + (i * hash2)) %
hashbits.Count);
}
}
return hashKeys;
}
}