c - 优化二分查找算法

Question

在二分查找中，我们有两个比较，一个是大于，另一个是小于，否则是中间值。您将如何优化以便我们只需要检查一次？

bool binSearch(int array[], int key, int left, int right)
{

    mid = left + (right-left)/2;
    if (key < array[mid])
        return binSearch(array, key, left, mid-1);
    else if (key > array[mid])
        return binSearch(array, key, mid+1, right);
    else if (key == array[mid])
        return TRUE; // Found

    return FALSE; // Not Found
}

score 17 · Accepted Answer

17

我会先尝试bsearch()标准函数。它很有可能比您的方法表现更好。

于 2009-03-23T15:42:39.797 回答

score 8 · Accepted Answer

不建议按照您描述的方式尝试和优化。来自维基百科上的二进制搜索算法文章：

大约有一半的时间，第一次测试为真，因此只有一次比较 a 和 b，但另一半时间为假，第二次比较是强制的。这太严重了，以至于有些版本被重铸，以至于根本不进行第二次测试，因此直到跨度减少到零才确定相等性，从而排除了提前终止的可能性——请记住，大约一半的时间搜索将发生在小于限制的一次迭代的匹配值上。

通过使用诸如

if a = b then action3
 else if a > b then action2
  else action1;

这将强制对除最后一次搜索之外的所有内容执行两次比较，而不是及早检测相等性（可能看起来如此）。

某些语言，如 Fortran，具有三向比较，允许通过一个分支到三个不同部分的比较来完成此步骤（请参阅三向比较示例的第十行）。如果您的语言不支持三向测试（大多数语言不支持），那么最好进行两次比较。

不过，我建议您查看同一篇文章中的迭代实现。

score 5 · Accepted Answer

我试图重构二进制搜索算法的优化步骤。我从这个迭代版本开始：

int binsearch_1( arr_t array[], size_t size, arr_t key, size_t *index ){
  if( !array || !size ) return 0;
  arr_t *p=array;
  int found=0;
  while( size > 0 ){
    size_t w=size/2;
         if( p[w] <  key  ){ p+=w+1; size-=w+1; }
    else if( p[w] >  key  ){         size =w;   }
    else  /* p[w] == key */{ p+=w; found=1; break; }
  }
  *index=p-array; return found;
}

从循环中消除比较：

int binsearch_2( arr_t array[], size_t size, arr_t key, size_t *index ){
  if( !array || !size ) return 0;
  arr_t *p=array;
  while( size > 0 ){
    size_t w=size/2;
    if( p[w+1] <= key ){ p+=w+1; size-=w+1; } else size =w;
  }
  *index=p-array; return p[0]==key;
}

从循环中展开小尺寸：

int binsearch_3( arr_t array[], size_t size, arr_t key, size_t *index ){
  if( !array || !size ) return 0;
  arr_t *p=array;
  while( size > 8 ){
    size_t w=size/2;
    if( p[w+1] <= key ){ p+=w+1; size-=w+1; } else  size =w;
  }
  if( size==8 ){ if( p[5] <=  key ){ p+=5; size=3; } else size=4; }
  if( size==7 ){ if( p[4] <=  key ){ p+=4; size=3; } else size=3; }
  if( size==6 ){ if( p[4] <=  key ){ p+=4; size=2; } else size=3; }
  if( size==5 ){ if( p[3] <=  key ){ p+=3; size=2; } else size=2; }
  if( size==4 ){ if( p[3] <=  key ){ p+=3; size=1; } else size=2; }
  if( size==3 ){ if( p[2] <=  key ){ p+=2; size=1; } else size=1; }
  if( size==2 ){ if( p[2] <=  key ){ p+=2; size=0; } else size=1; }
  if( size==1 ){ if( p[1] <=  key ){ p+=1;         }              }
  *index=p-array; return p[0]==key;
}

重新排序 if 语句，将特殊情况 [size==pow(2,N)-1] 移到末尾：

int binsearch_4( arr_t array[], size_t size, arr_t key, size_t *index ){
  if( !array || !size ) return 0;
  arr_t *p=array;
  while( size > 8 ){
    size_t w=size/2;
    if( p[w+1] <= key ){ p+=w+1; size-=w+1; } else  size =w;
  }
  if( size==8 ){ if( p[5] <=  key ){ p+=5; size=3; } else size=4; }
  if( size==6 ){ if( p[4] <=  key ){ p+=4; size=2; } else size=3; }
  if( size==5 ){ if( p[3] <=  key ){ p+=3; size=2; } else size=2; }
  if( size==4 ){ if( p[3] <=  key ){ p+=3; size=1; } else size=2; }
  if( size==2 ){ if( p[2] <=  key ){ p+=2; size=0; } else size=1; }

  if( size==7 ){ if( p[4] <=  key ){ p+=4; size=3; } else size=3; }
  if( size==3 ){ if( p[2] <=  key ){ p+=2; size=1; } else size=1; }
  if( size==1 ){ if( p[1] <=  key ){ p+=1;         }              }
  *index=p-array; return p[0]==key;
}

将 if 语句更改为 switch 语句：

int binsearch_5( arr_t array[], size_t size, arr_t key, size_t *index ){
  if( !array || !size ) return 0;
  arr_t *p=array;
  while( size > 8 ){
    size_t w=size/2;
    if( p[w+1] <= key ){ p+=w+1; size-=w+1; } else  size =w;
  }
  if( size==8 ){ if( p[5] <=  key ){ p+=5; size=3; } else size=4; }
  if( size==6 ){ if( p[4] <=  key ){ p+=4; size=2; } else size=3; }
  if( size==5 ){ if( p[3] <=  key ){ p+=3; size=2; } else size=2; }
  if( size==4 ){ if( p[3] <=  key ){ p+=3; size=1; } else size=2; }
  if( size==2 ){ if( p[2] <=  key ){ p+=2; size=0; } else size=1; }

  switch(size){
    case 7: if( p[4] <= key ) p+=4;
    case 3: if( p[2] <= key ) p+=2;
    case 1: if( p[1] <= key ) p+=1;
  }
  *index=p-array; return p[0]==key;
}

扩展 switch 语句以处理所有特殊情况：

int binsearch_6( arr_t array[], size_t size, arr_t key, size_t *index ){
  if( !array || !size ) return 0;
  arr_t *p=array;
  switch( size ){
#define C(n) case ((size_t)1<<n)-1: if( p[1<<(n-1)]<=key ) p+=1<<(n-1); 
#if SIZE_MAX == UINT64_MAX 
                                              C(63) C(62) C(61)
    C(60) C(59) C(58) C(57) C(56) C(55) C(54) C(53) C(52) C(51)
    C(50) C(49) C(48) C(47) C(46) C(45) C(44) C(43) C(42) C(41)
    C(40) C(39) C(38) C(37) C(36) C(35) C(34) C(33) C(32)
#endif 
                                                          C(31)
    C(30) C(29) C(28) C(27) C(26) C(25) C(24) C(23) C(22) C(21)
    C(20) C(19) C(18) C(17) C(16) C(15) C(14) C(13) C(12) C(11)
    C(10) C( 9) C( 8) C( 7) C( 6) C( 5) C( 4) C( 3) C( 2) C( 1)
#undef C 
      break;
    default:
      while( size > 0 ){
        size_t w=size/2;
        if( p[w] < key ){ p+=w+1; size-=w+1; } else size=w;
      }
  }
  *index=p-array; return p[0]==key;
}

从代码中消除一般情况处理： [ w 是最小的数字： w==pow(2,N)-1; 尺寸<=2*(w+1) ]

int binsearch_7( arr_t array[], size_t size, arr_t key, size_t *index ){
  if( !array || !size ) return 0;
  arr_t *p=array;
  size_t w=(size-1)>>1; w|=w>>1; w|=w>>2; w|=w>>4; w|=w>>8; w|=w>>16;
#if SIZE_MAX == UINT64_MAX
  w|=w>>32;
#endif
  if( p[w]<key ) p+=size-w-1;
  switch( w ){
#define C(n) case ((size_t)1<<n)-1: if( p[1<<(n-1)]<=key ) p+=1<<(n-1);
#if SIZE_MAX == UINT64_MAX
                                              C(63) C(62) C(61)
    C(60) C(59) C(58) C(57) C(56) C(55) C(54) C(53) C(52) C(51)
    C(50) C(49) C(48) C(47) C(46) C(45) C(44) C(43) C(42) C(41)
    C(40) C(39) C(38) C(37) C(36) C(35) C(34) C(33) C(32)
#endif
                                                          C(31)
    C(30) C(29) C(28) C(27) C(26) C(25) C(24) C(23) C(22) C(21)
    C(20) C(19) C(18) C(17) C(16) C(15) C(14) C(13) C(12) C(11)
    C(10) C( 9) C( 8) C( 7) C( 6) C( 5) C( 4) C( 3) C( 2) C( 1)
#undef C
  }
  *index=p-array; return p[0]==key;
}

我所做的最后一步是简化案例标签 [从：'((size_t)1<<n)-1' 到：'n'] 但我发现修改后的代码在我的旧 PC 上比以前的版本。

score 4 · Accepted Answer

如果你想优化你的二分搜索算法，你必须用迭代替换递归。请参阅维基百科中的示例。

如果您需要更多详细信息，请告诉我。

score 4 · Accepted Answer

在您发布的代码中，您可以删除最后一个比较。也就是说，如果key不小于array[mid]或大于array[mid]，则根据定义它是相等的。所以你的代码变成：

mid = left + (right-left)/2;
if (key < array[mid])
    return binSearch(array, key, left, mid-1);
else if (key > array[mid])
    return binSearch(array, key, mid+1, right);
else 
    return TRUE; // Found

另请注意，我已删除最后一行。在您的原始代码中，return FALSE;永远不可能执行。我会假设您在检查的开头有一个检查binSearch是否left<= right。

在 C 中，没有办法根据小于、等于、大于进行三向比较/分支，因此您必须进行两次比较才能在三种可能性中进行选择。

score 2 · Accepted Answer

对于整数没关系，不要这样做。

对于更昂贵的比较，使用 -1, 0, 1 表示 <, =, > 就像在 C 库函数 strcmp 或 Java 的 compareTo() 中一样。

score 2 · Accepted Answer

Ganesh M - 如果数组中不存在密钥，那么您的函数将被卡在一个永无止境的循环中。它永远不会返回 FALSE。

如果键不存在，找到插入点的最佳方法是什么？

考虑 <、== 和 > 的条件“如果级联”将仅返回 TRUE，或者永远继续计算。

我需要最佳条件来确定密钥何时被隔离为不存在。

我知道这会减慢搜索速度，但我想减慢搜索速度。

使用 log(N)/log(2) 似乎是个好主意，但是当 N 不是 2 的幂时，事情可能会变得混乱。

也许，我应该选择一个大小为 2 的数组，并使用一个简单的 while 循环？

检查中点 == 是否会增加太多的比较次数？

score 2 · Accepted Answer

// Optimized Binary Search Implementation
int binsearch(int* data, int size, int val)
{
    int result = -1;
    int start, mid, end;
    start = 0;
    end = size - 1;
    mid = (start + end) >> 1;
    while (start <= end && data[mid] != val)
    {
        if (data[mid] > val)
            end = mid - 1;
        else
            start = mid + 1;
        mid = (start + end) >> 1;
    }
    if (val == data[mid])
        result = mid;
    return result;
}

score 1 · Accepted Answer

它是 K&R 第二版中的一个练习题。

While(low <= high && arr[mid]!=num)
{
    if(arr[mid] > num)
    {
       low = mid+1;
    }
    else
    {
       high = mid-1;
    }
    mid = (low+high)/2;
}

if(arr[mid] == num)
{
    printf("found the ugly number");
    return mid;
}

score 1 · Accepted Answer

我在 C# 中使用无分支版本与普通分支版本进行了一些性能测试。分支版本总是获胜。我试图让性能测试产生随机的最坏情况结果，以损害找到或未找到结果的分支，但它可能不是完全最坏的情况。这一切都说得通，这要归功于 Peter Cordes 在https://stackoverflow.com/a/54273248提供的出色链接。

// choose on of these three...
#define BRANCHLESS_UNSAFE // branchless binary search
//#define BRANCHLESS_FROM_SOURCE // copy source from Array.BinarySearch
//#define BRANCHLESS_BUILTIN // Array.BinarySearch

// BRANCHLESS_UNSAFE is on average 4-5 ms slower than BRANCHLESS_BUILTIN on threadripper 1950x

using System;
using System.Diagnostics;
using System.Linq;

namespace BranchlessBinarySearchNamespace
{
    /// <summary>
    /// C# branchless binary search performance test
    /// The built in array binary search is currently faster in release mode, it's possible the compiler is smarter than me :)
    /// </summary>
    public static unsafe class BranchlessBinarySearch
    {
        // if index not found, returns ~(index of element to insert sorted) at.
        public static int BinarySearch<T>(T[] array, T value) where T : IComparable<T>
        {
            int cmp;
            int mid;
            int start = 0;
            int end = array.Length - 1;

#if BRANCHLESS_UNSAFE

            int** ptrArray = stackalloc int*[3];
            ptrArray[0] = &start;
            ptrArray[2] = &end;
            int g0;

#endif

            while (start <= end)
            {
                mid = start + ((end - start) >> 1);
                cmp = array[mid].CompareTo(value);
                if (cmp == 0)
                {
                    return mid;
                }

#if BRANCHLESS_UNSAFE

                // use -1 or 1 to access start or end in array of 3 ptr

                // compiler will turn into non branch most likely
                //g0 = cmp > 0 ? 1 : -1;

                // access sign bit, negate, multiply by 2 and add one. This gives us -1 if less than 0 or 1 if greater than 0.
                g0 = (-(int)((uint)cmp >> 31) * 2) + 1; 

                // index into ptrArr, if g0 is -1 (using end ptr) we access index 0 (1 + -1) which is where end ptr is
                // if g0 is 1 (using start ptr) we access index 2 (1 + 1) which is where start ptr is
                *(ptrArray[1 + g0]) = mid - g0;

#else

                // the above two lines equivelant to this:
                // compiler may be able to optimize this, but the above code is simply an illustration on how to do
                // the binary search logic without any branching logic at all
                else if (cmp > 0)
                {
                    end = mid - 1;
                }
                else
                {
                    start = mid + 1;
                }

#endif

            }
            return ~start;
        }

        private static unsafe void Test(int[] items, int[] find, bool output)
        {
            if (output)
            {
                const bool testBranchless =

#if BRANCHLESS_UNSAFE || BRANCHLESS_FROM_SOURCE

                true;

#else

                false;

#endif

                Console.Write("Testing {0} items, branchless: {1}... ", items.Length, testBranchless);
            }
            Stopwatch sw = Stopwatch.StartNew();

#if BRANCHLESS_UNSAFE || BRANCHLESS_FROM_SOURCE

            for (int i = 0; i < find.Length; i++)
            {
                BinarySearch(items, find[i]);
                //int idx = BinarySearch(items, find[i]);
                //if (idx != Array.BinarySearch(items, find[i]))
                {
                    //throw new ApplicationException("Binary search invalid result");
                }
            }

#else

            for (int i = 0; i < find.Length; i++)
            {
                Array.BinarySearch(items, find[i]);
                //int idx = Array.BinarySearch(items, find[i]);
                //if (idx != Array.BinarySearch(items, find[i])) // ensure same perf characteristic as branchless test
                {
                    //throw new ApplicationException("Binary search invalid result");
                }
            }

#endif

            if (output)
            {
                Console.WriteLine("Time: {0:0.00} ms", sw.Elapsed.TotalMilliseconds);
            }
        }

        private static unsafe void Test(int count, bool output = true)
        {
            Random r = new Random();
            int[] items = new int[count];
            for (int i = 0; i < items.Length; i++)
            {
                items[i] = r.Next(0, items.Length);
            }
            Array.Sort(items);
            int[] find = new int[1024];
            for (int i = 0; i < 1024; i++)
            {
                int found = r.Next(0, 11);
                if (found > 6)
                {
                    // found
                    find[i] = items[r.Next(0, items.Length)];
                }
                else
                {
                    // not found
                    find[i] = (r.Next(0, 2) == 0 ? r.Next(int.MinValue, 0) : r.Next(items.Length + 1, int.MaxValue));
                }
            }
            Test(items, find, output);
        }

        public static void Main()
        {
            for (int i = 0; i < 10; i++)
            {
                for (int j = 10; j <= 10000000; j *= 10)
                {
                    Test(j);
                }
            }
        }
    }
}

score 0 · Accepted Answer

bool binSearch(int array[], int key, int len)
{
    int mid, low, high;

    low = 0;
    right = len - 1;
    mid = low + (high-low)/2;

    while ((low <= high) && (array[mid] != key)) {
         if (key < array[mid]) {
             high = mid - 1;
         } else {
             low = mid + 1;
         }
         mid = low + (high-low)/2;
    }

    if (array[mid] == key) {
        return TRUE;
    }
    return FALSE;
}

score -1 · Accepted Answer

BinarySearch = function (array, value)  
{  
    var l = 0;  
    var r = array.length;  
    var mid;  
    while (l!=r)  
    {  
        mid = Math.round( (r+l)/2 )-1;  
        if(value > array[mid])  
            l=mid+1;  
        else  
            r=mid;  
    }  
    if(array[mid]==value)  
        return mid;  
    else  
    {  
        if( (mid < (array.length-1)) && (array[mid+1]==value) )  
            return mid+1;  
        else  
            return -1; // Not found  
    }  
}

来源：http ://calcs.appspot.com/docview?id=agVjYWxjc3IPCxIIRG9jdW1lbnQY0w8M

c - 优化二分查找算法

12 回答 12

Related

Reference