c++ - 使用 STL 容器进行中位数计算时，正确的方法是什么？

Question

假设我需要从 1000000 个随机数值序列中检索中位数。

如果使用除之外的任何东西， std::list我没有（内置）方法对序列进行排序以进行中值计算。

如果使用std::list，我不能随机访问值来检索排序序列的中间（中位数）。

自己实现排序并使用 egstd::vector更好，还是使用std::list和使用std::list::iteratorfor-loop-walk 到中值更好？后者似乎不那么开销，但也感觉更难看..

或者我有更多更好的选择吗？

score 116 · Accepted Answer

任何随机访问容器（如std::vector）都可以使用标头中提供的标准std::sort算法进行排序<algorithm>。

std::nth_element为了找到中位数，使用;会更快。这足以将一个选定的元素放在正确的位置，但不能完全对容器进行排序。所以你可以找到这样的中位数：

int median(vector<int> &v)
{
    size_t n = v.size() / 2;
    nth_element(v.begin(), v.begin()+n, v.end());
    return v[n];
}

score 42 · Accepted Answer

中位数比 Mike Seymour 的答案更复杂。中位数的不同取决于样本中的项目数是偶数还是奇数。如果项目数为偶数，则中位数为中间两项的平均值。这意味着整数列表的中位数可以是分数。最后，空列表的中位数是未定义的。这是通过我的基本测试用例的代码：

///Represents the exception for taking the median of an empty list
class median_of_empty_list_exception:public std::exception{
  virtual const char* what() const throw() {
    return "Attempt to take the median of an empty list of numbers.  "
      "The median of an empty list is undefined.";
  }
};

///Return the median of a sequence of numbers defined by the random
///access iterators begin and end.  The sequence must not be empty
///(median is undefined for an empty set).
///
///The numbers must be convertible to double.
template<class RandAccessIter>
double median(RandAccessIter begin, RandAccessIter end) 
  if(begin == end){ throw median_of_empty_list_exception(); }
  std::size_t size = end - begin;
  std::size_t middleIdx = size/2;
  RandAccessIter target = begin + middleIdx;
  std::nth_element(begin, target, end);

  if(size % 2 != 0){ //Odd number of elements
    return *target;
  }else{            //Even number of elements
    double a = *target;
    RandAccessIter targetNeighbor= target-1;
    std::nth_element(begin, targetNeighbor, end);
    return (a+*targetNeighbor)/2.0;
  }
}

score 15 · Accepted Answer

该算法使用 STL nth_element (amortized O(N)) 算法和 max_element 算法 (O(n)) 有效地处理偶数和奇数大小的输入。请注意， nth_element 有另一个保证的副作用，即之前n的所有元素都保证小于v[n]，只是不一定排序。

//post-condition: After returning, the elements in v may be reordered and the resulting order is implementation defined.
double median(vector<double> &v)
{
  if(v.empty()) {
    return 0.0;
  }
  auto n = v.size() / 2;
  nth_element(v.begin(), v.begin()+n, v.end());
  auto med = v[n];
  if(!(v.size() & 1)) { //If the set size is even
    auto max_it = max_element(v.begin(), v.begin()+n);
    med = (*max_it + med) / 2.0;
  }
  return med;    
}

score 11 · Accepted Answer

这是 Mike Seymour 答案的更完整版本：

// Could use pass by copy to avoid changing vector
double median(std::vector<int> &v)
{
  size_t n = v.size() / 2;
  std::nth_element(v.begin(), v.begin()+n, v.end());
  int vn = v[n];
  if(v.size()%2 == 1)
  {
    return vn;
  }else
  {
    std::nth_element(v.begin(), v.begin()+n-1, v.end());
    return 0.5*(vn+v[n-1]);
  }
}

它处理奇数或偶数长度的输入。

score 6 · Accepted Answer

把这个线程的所有见解放在一起，我最终有了这个例程。它适用于任何 stl 容器或任何提供输入迭代器的类，并处理奇数和偶数大小的容器。它还在容器的副本上工作，不修改原始内容。

template <typename T = double, typename C>
inline const T median(const C &the_container)
{
    std::vector<T> tmp_array(std::begin(the_container), 
                             std::end(the_container));
    size_t n = tmp_array.size() / 2;
    std::nth_element(tmp_array.begin(), tmp_array.begin() + n, tmp_array.end());

    if(tmp_array.size() % 2){ return tmp_array[n]; }
    else
    {
        // even sized vector -> average the two middle values
        auto max_it = std::max_element(tmp_array.begin(), tmp_array.begin() + n);
        return (*max_it + tmp_array[n]) / 2.0;
    }
}

score 4 · Accepted Answer

您可以std::vector使用库函数对 a 进行排序std::sort。

std::vector<int> vec;
// ... fill vector with stuff
std::sort(vec.begin(), vec.end());

score 2 · Accepted Answer

存在线性时间选择算法。下面的代码仅在容器具有随机访问迭代器时才有效，但可以修改它以使其在没有随机访问迭代器的情况下工作——您只需要更加小心以避免使用和之类的快捷end - begin方式iter + n。

#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <vector>

template<class A, class C = std::less<typename A::value_type> >
class LinearTimeSelect {
public:
    LinearTimeSelect(const A &things) : things(things) {}
    typename A::value_type nth(int n) {
        return nth(n, things.begin(), things.end());
    }
private:
    static typename A::value_type nth(int n,
            typename A::iterator begin, typename A::iterator end) {
        int size = end - begin;
        if (size <= 5) {
            std::sort(begin, end, C());
            return begin[n];
        }
        typename A::iterator walk(begin), skip(begin);
#ifdef RANDOM // randomized algorithm, average linear-time
        typename A::value_type pivot = begin[std::rand() % size];
#else // guaranteed linear-time, but usually slower in practice
        while (end - skip >= 5) {
            std::sort(skip, skip + 5);
            std::iter_swap(walk++, skip + 2);
            skip += 5;
        }
        while (skip != end) std::iter_swap(walk++, skip++);
        typename A::value_type pivot = nth((walk - begin) / 2, begin, walk);
#endif
        for (walk = skip = begin, size = 0; skip != end; ++skip)
            if (C()(*skip, pivot)) std::iter_swap(walk++, skip), ++size;
        if (size <= n) return nth(n - size, walk, end);
        else return nth(n, begin, walk);
    }
    A things;
};

int main(int argc, char **argv) {
    std::vector<int> seq;
    {
        int i = 32;
        std::istringstream(argc > 1 ? argv[1] : "") >> i;
        while (i--) seq.push_back(i);
    }
    std::random_shuffle(seq.begin(), seq.end());
    std::cout << "unordered: ";
    for (std::vector<int>::iterator i = seq.begin(); i != seq.end(); ++i)
        std::cout << *i << " ";
    LinearTimeSelect<std::vector<int> > alg(seq);
    std::cout << std::endl << "linear-time medians: "
        << alg.nth((seq.size()-1) / 2) << ", " << alg.nth(seq.size() / 2);
    std::sort(seq.begin(), seq.end());
    std::cout << std::endl << "medians by sorting: "
        << seq[(seq.size()-1) / 2] << ", " << seq[seq.size() / 2] << std::endl;
    return 0;
}

score 2 · Accepted Answer

这是一个考虑@MatthieuM 建议的答案。即不修改输入向量。它对偶数和奇数基数的范围使用单个部分排序（在索引向量上），而空范围由向量at方法抛出的异常处理：

double median(vector<int> const& v)
{
    bool isEven = !(v.size() % 2); 
    size_t n    = v.size() / 2;

    vector<size_t> vi(v.size()); 
    iota(vi.begin(), vi.end(), 0); 

    partial_sort(begin(vi), vi.begin() + n + 1, end(vi), 
        [&](size_t lhs, size_t rhs) { return v[lhs] < v[rhs]; }); 

    return isEven ? 0.5 * (v[vi.at(n-1)] + v[vi.at(n)]) : v[vi.at(n)];
}

演示

score 1 · Accepted Answer

犰狳有一个看起来像答案https://stackoverflow.com/a/34077478 by https://stackoverflow.com/users/2608582/matthew-fioravante中的实现

它使用一个调用nth_element和一个调用max_element，它在这里： https ://gitlab.com/conradsnicta/armadillo-code/-/blob/9.900.x/include/armadillo_bits/op_median_meat.hpp#L380

//! find the median value of a std::vector (contents is modified)
template<typename eT>
inline 
eT
op_median::direct_median(std::vector<eT>& X)
  {
  arma_extra_debug_sigprint();
  
  const uword n_elem = uword(X.size());
  const uword half   = n_elem/2;
  
  typename std::vector<eT>::iterator first    = X.begin();
  typename std::vector<eT>::iterator nth      = first + half;
  typename std::vector<eT>::iterator pastlast = X.end();
  
  std::nth_element(first, nth, pastlast);
  
  if((n_elem % 2) == 0)  // even number of elements
    {
    typename std::vector<eT>::iterator start   = X.begin();
    typename std::vector<eT>::iterator pastend = start + half;
    
    const eT val1 = (*nth);
    const eT val2 = (*(std::max_element(start, pastend)));
    
    return op_mean::robust_mean(val1, val2);
    }
  else  // odd number of elements
    {
    return (*nth);
    }
  }

score 0 · Accepted Answer

you can use this approch. It also takes care of sliding window.
Here days are no of trailing elements for which we want to find median and this makes sure the original container is not changed


#include<bits/stdc++.h>

using namespace std;

int findMedian(vector<int> arr, vector<int> brr, int d, int i)
{
    int x,y;
    x= i-d;
    y=d;
    brr.assign(arr.begin()+x, arr.begin()+x+y);


    sort(brr.begin(), brr.end());

    if(d%2==0)
    {
        return((brr[d/2]+brr[d/2 -1]));
    }

    else
    {
        return (2*brr[d/2]);
    }

    // for (int i = 0; i < brr.size(); ++i)
    // {
    //     cout<<brr[i]<<" ";
    // }

    return 0;

}

int main()
{
    int n;
    int days;
    int input;
    int median;
    int count=0;

    cin>>n>>days;

    vector<int> arr;
    vector<int> brr;

    for (int i = 0; i < n; ++i)
    {
        cin>>input;
        arr.push_back(input);
    }

    for (int i = days; i < n; ++i)
    {
        median=findMedian(arr,brr, days, i);

        
    }



    return 0;
}

c++ - 使用 STL 容器进行中位数计算时，正确的方法是什么？

10 回答 10

Related

Reference