这是我使用的代码
/// <summary>
/// Defines operations for estimating the complexity of a string
/// from its compression ratio.
/// </summary>
public interface IStringComplexity
{
/// <summary>
/// Computes a compression ratio for the given string.
/// </summary>
/// <param name="input">The string to measure.</param>
/// <returns>The compression ratio of <paramref name="input"/>.</returns>
double GetCompressionRatio(string input);
/// <summary>
/// Computes the complexity of a compression ratio relative to a range of ratios.
/// </summary>
/// <param name="min">Lower bound of the compression-ratio range.</param>
/// <param name="max">Upper bound of the compression-ratio range.</param>
/// <param name="current">The compression ratio being rated.</param>
/// <returns>The relative complexity for <paramref name="current"/>.</returns>
double GetRelevantComplexity(double min, double max, double current);
}
以及实现它的类
/// <summary>
/// Estimates the complexity of a string from how well it compresses with GZip:
/// the better a string compresses, the less complex it is considered to be.
/// </summary>
public class GZipStringComplexity : IStringComplexity
{
    /// <summary>
    /// Compresses the UTF-8 encoding of <paramref name="input"/> with GZip and returns
    /// the ratio of the uncompressed byte count to the compressed byte count.
    /// </summary>
    /// <param name="input">The text to measure; must be neither null nor empty.</param>
    /// <returns>
    /// Uncompressed size divided by compressed size. The value is below 1 when the
    /// compressed form is larger than the input.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="input"/> is null or empty.
    /// </exception>
    public double GetCompressionRatio(string input)
    {
        if (string.IsNullOrEmpty(input))
        {
            // ArgumentNullException is kept for the empty-string case as well so that
            // existing callers catching it keep working.
            throw new ArgumentNullException("input", "Input must not be null or empty.");
        }

        byte[] inputBytes = Encoding.UTF8.GetBytes(input);
        byte[] compressed;
        using (var outStream = new MemoryStream())
        {
            using (var zipStream = new GZipStream(outStream, CompressionMode.Compress))
            {
                zipStream.Write(inputBytes, 0, inputBytes.Length);
            }

            // Read the buffer only after the GZip stream has been disposed, so that all
            // pending compressed data has been written to outStream.
            compressed = outStream.ToArray();
        }

        return (double)inputBytes.Length / compressed.Length;
    }

    /// <summary>
    /// Returns the relative complexity of a string on a scale [0..1],
    /// where <c>0</c> means very low complexity (<paramref name="current"/> equals
    /// <paramref name="max"/>) and <c>1</c> means maximum complexity
    /// (<paramref name="current"/> equals <paramref name="min"/>).
    /// </summary>
    /// <param name="min">Minimum compression ratio observed.</param>
    /// <param name="max">Maximum compression ratio observed.</param>
    /// <param name="current">The compression ratio for which complexity
    /// is being calculated.</param>
    /// <returns>
    /// The relative complexity. The result is not clamped: a
    /// <paramref name="current"/> outside [<paramref name="min"/>, <paramref name="max"/>]
    /// produces a value outside [0..1].
    /// </returns>
    /// <exception cref="ArgumentException">
    /// <paramref name="max"/> is not greater than <paramref name="min"/>.
    /// </exception>
    public double GetRelevantComplexity(double min, double max, double current)
    {
        if (max <= min)
        {
            // An empty or inverted range cannot be normalized (division by zero
            // or a flipped scale).
            throw new ArgumentException("max must be greater than min.", "max");
        }

        // Min-max normalization of the ratio, inverted because a higher
        // compression ratio means a lower complexity.
        return 1 - (current - min) / (max - min);
    }
}
使用方法如下:
/// <summary>
/// Sample program showing how to rate strings with <see cref="IStringComplexity"/>.
/// </summary>
class Program
{
    /// <summary>
    /// Computes compression ratios for three sample strings and then rates the first
    /// two against the range spanned by 1 and the ratio of the highly repetitive sample.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        // Lower bound of the ratio range used for both ratings below.
        const double lowestRatio = 1;

        IStringComplexity analyzer = new GZipStringComplexity();

        string repetitiveSample = "HHHFHHFFHHFHHFFHHFHHHFHAAAAHHHFHHFFHHFHHFFHHFHHHFHAAAAHHHFHHFFHHFHHFFHHFHHHFHAAAAHHHFHHFFHHFHHFFHHFHHHFH";
        string variedSample = "mlcllltlgvalvcgvpamdipqtkqdlelpklagtwhsmamatnnislmatlkaplrvhitsllptpednleivlhrwennscvekkvlgektenpkkfkinytvaneatlldtdydnflflclqdtttpiqsmmcqylarvlveddeimqgfirafrplprhlwylldlkqmeepcrf";
        string singleCharSample = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";

        // Ratios reported by the original author for these inputs:
        // 2.9714285714285715, 1.3138686131386861 and 7.5 respectively.
        double repetitiveRatio = analyzer.GetCompressionRatio(repetitiveSample);
        double variedRatio = analyzer.GetCompressionRatio(variedSample);
        double highestRatio = analyzer.GetCompressionRatio(singleCharSample);

        // Rate each sample against the [lowestRatio, highestRatio] range.
        double repetitiveComplexity = analyzer.GetRelevantComplexity(lowestRatio, highestRatio, repetitiveRatio);
        double variedComplexity = analyzer.GetRelevantComplexity(lowestRatio, highestRatio, variedRatio);
    }
}
我发现一些有用的附加信息。
您可以尝试使用 7-Zip 库中的 LZMA、LZMA2 或 PPMD。这些库相对容易配置,并且提供了统一的接口,您可以借助它接入多种压缩算法。我发现这些算法的压缩性能比 GZip 好得多,但如果您把压缩比归一化到同一个尺度上,这一差异就不重要了。
如果您需要一个归一化的值,例如从 0 到 1,则需要首先计算所有序列的压缩比。这是因为您无法确定可能的最大压缩比是多少。