javascript - 从数组中采样一个随机子集

Question

什么是一种随机抽样的干净方法，而不是从 javascript 中的数组中替换？所以假设有一个数组

x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]

我想随机抽取 5 个唯一值；即生成长度为 5 的随机子集。要生成一个随机样本，可以执行以下操作：

x[Math.floor(Math.random()*x.length)];

但是，如果多次执行此操作，则存在多次获取相同条目的风险。

score 54 · Accepted Answer

我建议使用Fisher-Yates shuffle对数组的副本进行洗牌并切片：

function getRandomSubarray(arr, size) {
    var shuffled = arr.slice(0), i = arr.length, temp, index;
    while (i--) {
        index = Math.floor((i + 1) * Math.random());
        temp = shuffled[index];
        shuffled[index] = shuffled[i];
        shuffled[i] = temp;
    }
    return shuffled.slice(0, size);
}

var x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15];
var fiveRandomMembers = getRandomSubarray(x, 5);

请注意，这不是获取大数组的小随机子集的最有效方法，因为它会不必要地对整个数组进行混洗。为了获得更好的性能，您可以改为进行部分洗牌：

function getRandomSubarray(arr, size) {
    var shuffled = arr.slice(0), i = arr.length, min = i - size, temp, index;
    while (i-- > min) {
        index = Math.floor((i + 1) * Math.random());
        temp = shuffled[index];
        shuffled[index] = shuffled[i];
        shuffled[i] = temp;
    }
    return shuffled.slice(min);
}

score 18 · Accepted Answer

聚会有点晚了，但这可以通过下划线的新示例方法解决（下划线 1.5.2 - 2013 年 9 月）：

var x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15];

var randomFiveNumbers = _.sample(x, 5);

score 6 · Accepted Answer

在我看来，我认为没有必要洗牌整个牌组。您只需要确保您的样本是随机的，而不是您的套牌。您可以做的是size从前面选择数量，然后将采样数组中的每个位置与其中的另一个位置交换。所以，如果你允许替换，你会变得越来越混乱。

function getRandom(length) { return Math.floor(Math.random()*(length)); }

function getRandomSample(array, size) {
    var length = array.length;

    for(var i = size; i--;) {
        var index = getRandom(length);
        var temp = array[index];
        array[index] = array[i];
        array[i] = temp;
    }

    return array.slice(0, size);
}

2*size如果包含该方法，该算法只是slice选择随机样本的步骤。

更随机

为了使样本更加随机，我们可以随机选择样本的起点。但是获取样品要贵一些。

function getRandomSample(array, size) {
    var length = array.length, start = getRandom(length);

    for(var i = size; i--;) {
        var index = (start + i)%length, rindex = getRandom(length);
        var temp = array[rindex];
        array[rindex] = array[index];
        array[index] = temp;
    }
    var end = start + size, sample = array.slice(start, end);
    if(end > length)
        sample = sample.concat(array.slice(0, end - length));
    return sample;
}

使这更加随机的事实是，当您总是对前面的项目进行洗牌时，如果采样数组很大而样本很小，您往往不会经常在样本中得到它们。如果数组不应该始终相同，这将不是问题。所以，这个方法所做的就是改变洗牌区域开始的这个位置。

无更换

为了不必复制采样数组并且不用担心替换，您可以执行以下操作，但它确实为您提供3*size了与2*size.

function getRandomSample(array, size) {
    var length = array.length, swaps = [], i = size, temp;

    while(i--) {
        var rindex = getRandom(length);
        temp = array[rindex];
        array[rindex] = array[i];
        array[i] = temp;
        swaps.push({ from: i, to: rindex });
    }

    var sample = array.slice(0, size);

    // Put everything back.
    i = size;
    while(i--) {
         var pop = swaps.pop();
         temp = array[pop.from];
         array[pop.from] = array[pop.to];
         array[pop.to] = temp;
    }

    return sample;
}

无需更换，更随机

将提供更多随机样本的算法应用于无替换函数：

function getRandomSample(array, size) {
    var length = array.length, start = getRandom(length),
        swaps = [], i = size, temp;

    while(i--) {
        var index = (start + i)%length, rindex = getRandom(length);
        temp = array[rindex];
        array[rindex] = array[index];
        array[index] = temp;
        swaps.push({ from: index, to: rindex });
    }

    var end = start + size, sample = array.slice(start, end);
    if(end > length)
        sample = sample.concat(array.slice(0, end - length));

    // Put everything back.
    i = size;
    while(i--) {
         var pop = swaps.pop();
         temp = array[pop.from];
         array[pop.from] = array[pop.to];
         array[pop.to] = temp;
    }

    return sample;
}

快点...

像所有这些帖子一样，这使用了 Fisher-Yates Shuffle。但是，我删除了复制数组的开销。

function getRandomSample(array, size) {
    var r, i = array.length, end = i - size, temp, swaps = getRandomSample.swaps;

    while (i-- > end) {
        r = getRandom(i + 1);
        temp = array[r];
        array[r] = array[i];
        array[i] = temp;
        swaps.push(i);
        swaps.push(r);
    }

    var sample = array.slice(end);

    while(size--) {
        i = swaps.pop();
        r = swaps.pop();
        temp = array[i];
        array[i] = array[r];
        array[r] = temp;
    }

    return sample;
}
getRandomSample.swaps = [];

score 5 · Accepted Answer

或者...如果您使用 underscore.js...

_und = require('underscore');

...

function sample(a, n) {
    return _und.take(_und.shuffle(a), n);
}

很简单。

score 5 · Accepted Answer

虽然我强烈支持按照Tim Down 的建议使用 Fisher-Yates Shuffle，但这是一种非常简短的方法，可以根据要求实现随机子集，在数学上是正确的，包括空集和给定集本身。

注意解决方案取决于lodash / underscore：

洛达什 v4

const _ = require('loadsh')

function subset(arr) {
    return _.sampleSize(arr, _.random(arr.length))
}

洛达什 v3

const _ = require('loadsh')

function subset(arr) {
    return _.sample(arr, _.random(arr.length));
}

score 5 · Accepted Answer

您可以通过这种方式获得 5 个元素的样本：

var sample = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
.map(a => [a,Math.random()])
.sort((a,b) => {return a[1] < b[1] ? -1 : 1;})
.slice(0,5)
.map(a => a[0]);

您可以将其定义为要在代码中使用的函数：

var randomSample = function(arr,num){ return arr.map(a => [a,Math.random()]).sort((a,b) => {return a[1] < b[1] ? -1 : 1;}).slice(0,num).map(a => a[0]); }

或者将其添加到 Array 对象本身：

    Array.prototype.sample = function(num){ return this.map(a => [a,Math.random()]).sort((a,b) => {return a[1] < b[1] ? -1 : 1;}).slice(0,num).map(a => a[0]); };

如果需要，您可以将代码分开以获得 2 个功能（Shuffle 和 Sample）：

    Array.prototype.shuffle = function(){ return this.map(a => [a,Math.random()]).sort((a,b) => {return a[1] < b[1] ? -1 : 1;}).map(a => a[0]); };
    Array.prototype.sample = function(num){ return this.shuffle().slice(0,num); };

score 2 · Accepted Answer

这是另一个基于 Fisher-Yates Shuffle 的实现。但是这个针对样本大小明显小于数组长度的情况进行了优化。此实现不会扫描整个数组，也不会分配与原始数组一样大的数组。它使用稀疏数组来减少内存分配。

function getRandomSample(array, count) {
    var indices = [];
    var result = new Array(count);
    for (let i = 0; i < count; i++ ) {
        let j = Math.floor(Math.random() * (array.length - i) + i);
        result[i] = array[indices[j] === undefined ? j : indices[j]];
        indices[j] = indices[i] === undefined ? i : indices[i];
    }
    return result;
}

score 2 · Accepted Answer

如果您使用 lodash，则 API 在 4.x 中已更改：

const oneItem = _.sample(arr);
const nItems = _.sampleSize(arr, n);

https://lodash.com/docs#sampleSize

score 2 · Accepted Answer

也许我遗漏了一些东西，但似乎有一个解决方案不需要洗牌的复杂性或潜在的开销：

function sample(array,size) {
  const results = [],
    sampled = {};
  while(results.length<size && results.length<array.length) {
    const index = Math.trunc(Math.random() * array.length);
    if(!sampled[index]) {
      results.push(array[index]);
      sampled[index] = true;
    }
  }
  return results;
}

score 2 · Accepted Answer

很多这些答案都在谈论克隆、改组、切片原始数组。我很好奇为什么从熵/分布的角度来看这会有所帮助。

我不是专家，但我确实使用索引编写了一个示例函数来避免任何数组突变——尽管它确实添加到了 Set 中。我也不知道这是如何随机分布的，但是代码很简单，我认为在这里可以得到答案。

function sample(array, size = 1) {
  const { floor, random } = Math;
  let sampleSet = new Set();
  for (let i = 0; i < size; i++) {
    let index;
    do { index = floor(random() * array.length); }
    while (sampleSet.has(index));
    sampleSet.add(index);
  }
  return [...sampleSet].map(i => array[i]);
}

const words = [
  'confused', 'astonishing', 'mint', 'engine', 'team', 'cowardly', 'cooperative',
  'repair', 'unwritten', 'detailed', 'fortunate', 'value', 'dogs', 'air', 'found',
  'crooked', 'useless', 'treatment', 'surprise', 'hill', 'finger', 'pet',
  'adjustment', 'alleged', 'income'
];

console.log(sample(words, 4));

score 1 · Accepted Answer

您可以在选择元素时从数组的副本中删除元素。性能可能并不理想，但它可能满足您的需要：

function getRandom(arr, size) {
  var copy = arr.slice(0), rand = [];
  for (var i = 0; i < size && i < copy.length; i++) {
    var index = Math.floor(Math.random() * copy.length);
    rand.push(copy.splice(index, 1)[0]);
  }
  return rand;
}

score 0 · Accepted Answer

对于非常大的数组，使用索引而不是数组成员更有效。

这就是我在此页面上没有找到我喜欢的任何内容后最终得到的结果。

/**
 * Get a random subset of an array
 * @param {Array} arr - Array to take a smaple of.
 * @param {Number} sample_size - Size of sample to pull.
 * @param {Boolean} return_indexes - If true, return indexes rather than members
 * @returns {Array|Boolean} - An array containing random a subset of the members or indexes.
 */
function getArraySample(arr, sample_size, return_indexes = false) {
    if(sample_size > arr.length) return false;
    const sample_idxs = [];
    const randomIndex = () => Math.floor(Math.random() * arr.length);
    while(sample_size > sample_idxs.length){
        let idx = randomIndex();
        while(sample_idxs.includes(idx)) idx = randomIndex();
        sample_idxs.push(idx);
    }
    sample_idxs.sort((a, b) => a > b ? 1 : -1);
    if(return_indexes) return sample_idxs;
    return sample_idxs.map(i => arr[i]);
}

score 0 · Accepted Answer

我的方法是创建一个getRandomIndexes方法，您可以使用该方法创建一个索引数组，您将从主数组中提取这些索引。在这种情况下，我添加了一个简单的逻辑来避免示例中的相同索引。这就是它的工作原理

const getRandomIndexes = (length, size) => {
  const indexes = [];
  const created = {};

  while (indexes.length < size) {
    const random = Math.floor(Math.random() * length);
    if (!created[random]) {
      indexes.push(random);
      created[random] = true;
    }
  }
  return indexes;
};

此函数独立于您所拥有的任何内容，将为您提供一个索引数组，您可以使用这些索引从您的长度数组中提取值length，因此可以通过以下方式进行采样

const myArray = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

getRandomIndexes(myArray.length, 3).map(i => myArray[i])

每次调用该方法时，您都会得到不同的myArray. 在这一点上，这个解决方案很酷，但可以更好地采样不同的尺寸。如果你想这样做，你可以使用

getRandomIndexes(myArray.length, Math.ceil(Math.random() * 6)).map(i => myArray[i])

每次调用它都会给你一个从 1-6 不同的样本大小。

我希望这有帮助：D

score 0 · Accepted Answer

D3-array的shuffle使用 Fisher-Yeates shuffle 算法对数组进行随机重新排序。这是一个变异函数——意味着原始数组被重新排序，这有利于性能。

D3 用于浏览器 - 与节点一起使用更复杂。

https://github.com/d3/d3-array#shuffle

npm install d3-array

    //import {shuffle} from "d3-array" 
    
    let x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15];

    d3.shuffle(x)

    console.log(x) // it is shuffled

<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/5.0.0/d3.min.js"></script>

如果您不想改变原始数组

    let x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15];

    let shuffled_x = d3.shuffle(x.slice()) //calling slice with no parameters returns a copy of the original array

    console.log(x) // not shuffled
    console.log(shuffled_x)

<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/5.0.0/d3.min.js"></script>

score -1 · Accepted Answer

Underscore.js大约 70kb。如果你不需要所有额外的废话，rando.js只有大约 2kb（小 97%），它的工作方式如下：

console.log(randoSequence([8, 6, 7, 5, 3, 0, 9]).slice(-5));

<script src="https://randojs.com/2.0.0.js"></script>

您可以看到默认情况下它会跟踪原始索引，以防两个值相同但您仍然关心选择了哪一个。如果您不需要这些，您可以添加一个地图，如下所示：

console.log(randoSequence([8, 6, 7, 5, 3, 0, 9]).slice(-5).map((i) => i.value));

<script src="https://randojs.com/2.0.0.js"></script>

javascript - 从数组中采样一个随机子集

15 回答 15

更随机

无更换

无需更换，更随机

快点...

洛达什 v4

洛达什 v3

Related

Reference