Optimize Random.sample(_, _) for performance (#716)

* Optimize Random.sample(_, _) for performance * Make tests treat random samples as unordered * Test all sample sizes possible * Tweak random sampling algorithm for performance
2026-01-11 14:18:42 +01:00 · 2020-06-14 12:31:23 +08:00
parent fea0dfafa0
commit f3493d0499
4 changed files with 64 additions and 129 deletions
--- a/src/optional/wren_opt_random.wren
+++ b/src/optional/wren_opt_random.wren
@ -47,65 +47,38 @@ foreign class Random {
  int(end) { (float() * end).floor }
  int(start, end) { (float() * (end - start)).floor + start }
-  sample(list) { sample(list, 1)[0] }
+  sample(list) {
    if (list.count == 0) Fiber.abort("Not enough elements to sample.")
    return list[int(list.count)]
  }
  sample(list, count) {
    if (count > list.count) Fiber.abort("Not enough elements to sample.")
-    // There are (at least) two simple algorithms for choosing a number of
+    var result = []
    // samples from a list without replacement -- where we don't pick the same
    // element more than once.
    //
    // The first is faster when the number of samples is small relative to the
    // size of the collection. In many cases, it avoids scanning the entire
    // list. In the common case of just wanting one sample, it's a single
    // random index lookup.
    //
    // However, its performance degrades badly as the sample size increases.
    // Vitter's algorithm always scans the entire list, but it's also always
    // O(n).
    //
    // The cutoff point between the two follows a quadratic curve on the same
    // size. Based on some empirical testing, scaling that by 5 seems to fit
    // pretty closely and chooses the fastest one for the given sample and
    // collection size.
    if (count * count * 5 < list.count) {
      // Pick random elements and retry if you hit a previously chosen one.
      var picked = {}
      var result = []
      for (i in 0...count) {
        // Find an index that we haven't already selected.
        var index
        while (true) {
          index = int(list.count)
          if (!picked.containsKey(index)) break
        }
    // The algorithm described in "Programming pearls: a sample of brilliance".
    // Use a hash map for sample sizes less than 1/4 of the population size and
    // an array of booleans for larger samples. This simple heuristic improves
    // performance for large sample sizes as well as reduces memory usage.
    if (count * 4 < list.count) {
      var picked = {}
      for (i in list.count - count...list.count) {
        var index = int(i + 1)
        if (picked.containsKey(index)) index = i
        picked[index] = true
        result.add(list[index])
      }
      return result
    } else {
-      // Jeffrey Vitter's Algorithm R.
+      var picked = List.filled(list.count, false)
-
+      for (i in list.count - count...list.count) {
-      // Fill the reservoir with the first elements in the list.
+        var index = int(i + 1)
-      var result = list[0...count]
+        if (picked[index]) index = i
-
+        picked[index] = true
-      // We want to ensure the results are always in random order, so shuffle
+        result.add(list[index])
      // them. In cases where the sample size is the entire collection, this
      // devolves to running Fisher-Yates on a copy of the list.
      shuffle(result)
      // Now walk the rest of the list. For each element, randomly consider
      // replacing one of the reservoir elements with it. The probability here
      // works out such that it does this uniformly.
      for (i in count...list.count) {
        var slot = int(0, i + 1)
        if (slot < count) result[slot] = list[i]
      }
      return result
    }
    return result
  }
  shuffle(list) {
--- a/src/optional/wren_opt_random.wren.inc
+++ b/src/optional/wren_opt_random.wren.inc
@ -49,65 +49,38 @@ static const char* randomModuleSource =
 "  int(end) { (float() * end).floor }\n"
 "  int(start, end) { (float() * (end - start)).floor + start }\n"
 "\n"
-"  sample(list) { sample(list, 1)[0] }\n"
+"  sample(list) {\n"
 "    if (list.count == 0) Fiber.abort(\"Not enough elements to sample.\")\n"
 "    return list[int(list.count)]\n"
 "  }\n"
 "  sample(list, count) {\n"
 "    if (count > list.count) Fiber.abort(\"Not enough elements to sample.\")\n"
 "\n"
-"    // There are (at least) two simple algorithms for choosing a number of\n"
+"    var result = []\n"
 "    // samples from a list without replacement -- where we don't pick the same\n"
 "    // element more than once.\n"
 "    //\n"
 "    // The first is faster when the number of samples is small relative to the\n"
 "    // size of the collection. In many cases, it avoids scanning the entire\n"
 "    // list. In the common case of just wanting one sample, it's a single\n"
 "    // random index lookup.\n"
 "    //\n"
 "    // However, its performance degrades badly as the sample size increases.\n"
 "    // Vitter's algorithm always scans the entire list, but it's also always\n"
 "    // O(n).\n"
 "    //\n"
 "    // The cutoff point between the two follows a quadratic curve on the same\n"
 "    // size. Based on some empirical testing, scaling that by 5 seems to fit\n"
 "    // pretty closely and chooses the fastest one for the given sample and\n"
 "    // collection size.\n"
 "    if (count * count * 5 < list.count) {\n"
 "      // Pick random elements and retry if you hit a previously chosen one.\n"
 "      var picked = {}\n"
 "      var result = []\n"
 "      for (i in 0...count) {\n"
 "        // Find an index that we haven't already selected.\n"
 "        var index\n"
 "        while (true) {\n"
 "          index = int(list.count)\n"
 "          if (!picked.containsKey(index)) break\n"
 "        }\n"
 "\n"
 "    // The algorithm described in \"Programming pearls: a sample of brilliance\".\n"
 "    // Use a hash map for sample sizes less than 1/4 of the population size and\n"
 "    // an array of booleans for larger samples. This simple heuristic improves\n"
 "    // performance for large sample sizes as well as reduces memory usage.\n"
 "    if (count * 4 < list.count) {\n"
 "      var picked = {}\n"
 "      for (i in list.count - count...list.count) {\n"
 "        var index = int(i + 1)\n"
 "        if (picked.containsKey(index)) index = i\n"
 "        picked[index] = true\n"
 "        result.add(list[index])\n"
 "      }\n"
 "\n"
 "      return result\n"
 "    } else {\n"
-"      // Jeffrey Vitter's Algorithm R.\n"
+"      var picked = List.filled(list.count, false)\n"
-"\n"
+"      for (i in list.count - count...list.count) {\n"
-"      // Fill the reservoir with the first elements in the list.\n"
+"        var index = int(i + 1)\n"
-"      var result = list[0...count]\n"
+"        if (picked[index]) index = i\n"
-"\n"
+"        picked[index] = true\n"
-"      // We want to ensure the results are always in random order, so shuffle\n"
+"        result.add(list[index])\n"
 "      // them. In cases where the sample size is the entire collection, this\n"
 "      // devolves to running Fisher-Yates on a copy of the list.\n"
 "      shuffle(result)\n"
 "\n"
 "      // Now walk the rest of the list. For each element, randomly consider\n"
 "      // replacing one of the reservoir elements with it. The probability here\n"
 "      // works out such that it does this uniformly.\n"
 "      for (i in count...list.count) {\n"
 "        var slot = int(0, i + 1)\n"
 "        if (slot < count) result[slot] = list[i]\n"
 "      }\n"
 "\n"
 "      return result\n"
 "    }\n"
 "\n"
 "    return result\n"
 "  }\n"
 "\n"
 "  shuffle(list) {\n"
--- a/test/random/sample_count_all.wren
+++ b/test/random/sample_count_all.wren
@ -1,19 +0,0 @@
 import "random" for Random
 var random = Random.new(12345)
 // Should choose all elements with roughly equal probability.
 var list = ["a", "b", "c"]
 var histogram = {}
 for (i in 1..5000) {
  var sample = random.sample(list, 3)
  var string = sample.toString
  if (!histogram.containsKey(string)) histogram[string] = 0
  histogram[string] = histogram[string] + 1
 }
 System.print(histogram.count) // expect: 6
 for (key in histogram.keys) {
  var error = (histogram[key] / (5000 / 6) - 1).abs
  if (error > 0.1) System.print("!!! %(error)")
 }
--- a/test/random/sample_count_multiple.wren
+++ b/test/random/sample_count_multiple.wren
@ -3,17 +3,25 @@ import "random" for Random
 var random = Random.new(12345)
 // Should choose all elements with roughly equal probability.
-var list = ["a", "b", "c", "d"]
+var list = (0...10).toList
-var histogram = {}
+var binom = [1, 10, 45, 120, 210, 252, 210, 120, 45, 10, 1]
 for (i in 1..5000) {
  var sample = random.sample(list, 3)
  var string = sample.toString
  if (!histogram.containsKey(string)) histogram[string] = 0
  histogram[string] = histogram[string] + 1
 }
-System.print(histogram.count) // expect: 24
+for (k in 0..10) {
-for (key in histogram.keys) {
+  var count = binom[k]
-  var error = (histogram[key] / (5000 / 24) - 1).abs
+
-  if (error > 0.2) System.print("!!! %(error)")
+  var histogram = {}
  for (i in 1..count * 100) {
    var sample = random.sample(list, k)
    // Create a bitmask to represent the unordered set.
    var bitmask = 0
    sample.each {|s| bitmask = bitmask | (1 << s) }
    if (!histogram.containsKey(bitmask)) histogram[bitmask] = 0
    histogram[bitmask] = histogram[bitmask] + 1
  }
  if (histogram.count != count) System.print("!!! %(count) %(histogram.count)")
  for (key in histogram.keys) {
    var error = (histogram[key] - 100).abs
    if (error > 50) System.print("!!! %(error)")
  }
 }