From f3493d049930a004bf0ee24c2ccc02f8ba140b3a Mon Sep 17 00:00:00 2001
From: kawa-yoiko <45892908+kawa-yoiko@users.noreply.github.com>
Date: Sun, 14 Jun 2020 12:31:23 +0800
Subject: [PATCH] Optimize Random.sample(_, _) for performance (#716)

* Optimize Random.sample(_, _) for performance
* Make tests treat random samples as unordered
* Test all sample sizes possible
* Tweak random sampling algorithm for performance
---
 src/optional/wren_opt_random.wren      | 71 ++++++++------------------
 src/optional/wren_opt_random.wren.inc  | 71 ++++++++------------------
 test/random/sample_count_all.wren      | 19 -------
 test/random/sample_count_multiple.wren | 32 +++++++-----
 4 files changed, 64 insertions(+), 129 deletions(-)
 delete mode 100644 test/random/sample_count_all.wren

diff --git a/src/optional/wren_opt_random.wren b/src/optional/wren_opt_random.wren
index b94f53b3..a7813b9f 100644
--- a/src/optional/wren_opt_random.wren
+++ b/src/optional/wren_opt_random.wren
@@ -47,65 +47,38 @@ foreign class Random {
   int(end) { (float() * end).floor }
   int(start, end) { (float() * (end - start)).floor + start }
 
-  sample(list) { sample(list, 1)[0] }
+  sample(list) {
+    if (list.count == 0) Fiber.abort("Not enough elements to sample.")
+    return list[int(list.count)]
+  }
   sample(list, count) {
     if (count > list.count) Fiber.abort("Not enough elements to sample.")
 
-    // There are (at least) two simple algorithms for choosing a number of
-    // samples from a list without replacement -- where we don't pick the same
-    // element more than once.
-    //
-    // The first is faster when the number of samples is small relative to the
-    // size of the collection. In many cases, it avoids scanning the entire
-    // list. In the common case of just wanting one sample, it's a single
-    // random index lookup.
-    //
-    // However, its performance degrades badly as the sample size increases.
-    // Vitter's algorithm always scans the entire list, but it's also always
-    // O(n).
-    //
-    // The cutoff point between the two follows a quadratic curve on the same
-    // size. Based on some empirical testing, scaling that by 5 seems to fit
-    // pretty closely and chooses the fastest one for the given sample and
-    // collection size.
-    if (count * count * 5 < list.count) {
-      // Pick random elements and retry if you hit a previously chosen one.
-      var picked = {}
-      var result = []
-      for (i in 0...count) {
-        // Find an index that we haven't already selected.
-        var index
-        while (true) {
-          index = int(list.count)
-          if (!picked.containsKey(index)) break
-        }
+    var result = []
 
+    // The algorithm described in "Programming pearls: a sample of brilliance".
+    // Use a hash map for sample sizes less than 1/4 of the population size and
+    // an array of booleans for larger samples. This simple heuristic improves
+    // performance for large sample sizes as well as reduces memory usage.
+    if (count * 4 < list.count) {
+      var picked = {}
+      for (i in list.count - count...list.count) {
+        var index = int(i + 1)
+        if (picked.containsKey(index)) index = i
         picked[index] = true
         result.add(list[index])
       }
-
-      return result
     } else {
-      // Jeffrey Vitter's Algorithm R.
-
-      // Fill the reservoir with the first elements in the list.
-      var result = list[0...count]
-
-      // We want to ensure the results are always in random order, so shuffle
-      // them. In cases where the sample size is the entire collection, this
-      // devolves to running Fisher-Yates on a copy of the list.
-      shuffle(result)
-
-      // Now walk the rest of the list. For each element, randomly consider
-      // replacing one of the reservoir elements with it. The probability here
-      // works out such that it does this uniformly.
-      for (i in count...list.count) {
-        var slot = int(0, i + 1)
-        if (slot < count) result[slot] = list[i]
+      var picked = List.filled(list.count, false)
+      for (i in list.count - count...list.count) {
+        var index = int(i + 1)
+        if (picked[index]) index = i
+        picked[index] = true
+        result.add(list[index])
       }
-
-      return result
     }
+
+    return result
   }
 
   shuffle(list) {
diff --git a/src/optional/wren_opt_random.wren.inc b/src/optional/wren_opt_random.wren.inc
index 4ccbc5e9..164f9d58 100644
--- a/src/optional/wren_opt_random.wren.inc
+++ b/src/optional/wren_opt_random.wren.inc
@@ -49,65 +49,38 @@ static const char* randomModuleSource =
 "  int(end) { (float() * end).floor }\n"
 "  int(start, end) { (float() * (end - start)).floor + start }\n"
 "\n"
-"  sample(list) { sample(list, 1)[0] }\n"
+"  sample(list) {\n"
+"    if (list.count == 0) Fiber.abort(\"Not enough elements to sample.\")\n"
+"    return list[int(list.count)]\n"
+"  }\n"
 "  sample(list, count) {\n"
 "    if (count > list.count) Fiber.abort(\"Not enough elements to sample.\")\n"
 "\n"
-"    // There are (at least) two simple algorithms for choosing a number of\n"
-"    // samples from a list without replacement -- where we don't pick the same\n"
-"    // element more than once.\n"
-"    //\n"
-"    // The first is faster when the number of samples is small relative to the\n"
-"    // size of the collection. In many cases, it avoids scanning the entire\n"
-"    // list. In the common case of just wanting one sample, it's a single\n"
-"    // random index lookup.\n"
-"    //\n"
-"    // However, its performance degrades badly as the sample size increases.\n"
-"    // Vitter's algorithm always scans the entire list, but it's also always\n"
-"    // O(n).\n"
-"    //\n"
-"    // The cutoff point between the two follows a quadratic curve on the same\n"
-"    // size. Based on some empirical testing, scaling that by 5 seems to fit\n"
-"    // pretty closely and chooses the fastest one for the given sample and\n"
-"    // collection size.\n"
-"    if (count * count * 5 < list.count) {\n"
-"      // Pick random elements and retry if you hit a previously chosen one.\n"
-"      var picked = {}\n"
-"      var result = []\n"
-"      for (i in 0...count) {\n"
-"        // Find an index that we haven't already selected.\n"
-"        var index\n"
-"        while (true) {\n"
-"          index = int(list.count)\n"
-"          if (!picked.containsKey(index)) break\n"
-"        }\n"
+"    var result = []\n"
 "\n"
+"    // The algorithm described in \"Programming pearls: a sample of brilliance\".\n"
+"    // Use a hash map for sample sizes less than 1/4 of the population size and\n"
+"    // an array of booleans for larger samples. This simple heuristic improves\n"
+"    // performance for large sample sizes as well as reduces memory usage.\n"
+"    if (count * 4 < list.count) {\n"
+"      var picked = {}\n"
+"      for (i in list.count - count...list.count) {\n"
+"        var index = int(i + 1)\n"
+"        if (picked.containsKey(index)) index = i\n"
 "        picked[index] = true\n"
 "        result.add(list[index])\n"
 "      }\n"
-"\n"
-"      return result\n"
 "    } else {\n"
-"      // Jeffrey Vitter's Algorithm R.\n"
-"\n"
-"      // Fill the reservoir with the first elements in the list.\n"
-"      var result = list[0...count]\n"
-"\n"
-"      // We want to ensure the results are always in random order, so shuffle\n"
-"      // them. In cases where the sample size is the entire collection, this\n"
-"      // devolves to running Fisher-Yates on a copy of the list.\n"
-"      shuffle(result)\n"
-"\n"
-"      // Now walk the rest of the list. For each element, randomly consider\n"
-"      // replacing one of the reservoir elements with it. The probability here\n"
-"      // works out such that it does this uniformly.\n"
-"      for (i in count...list.count) {\n"
-"        var slot = int(0, i + 1)\n"
-"        if (slot < count) result[slot] = list[i]\n"
+"      var picked = List.filled(list.count, false)\n"
+"      for (i in list.count - count...list.count) {\n"
+"        var index = int(i + 1)\n"
+"        if (picked[index]) index = i\n"
+"        picked[index] = true\n"
+"        result.add(list[index])\n"
 "      }\n"
-"\n"
-"      return result\n"
 "    }\n"
+"\n"
+"    return result\n"
 "  }\n"
 "\n"
 "  shuffle(list) {\n"
diff --git a/test/random/sample_count_all.wren b/test/random/sample_count_all.wren
deleted file mode 100644
index fb8383ca..00000000
--- a/test/random/sample_count_all.wren
+++ /dev/null
@@ -1,19 +0,0 @@
-import "random" for Random
-
-var random = Random.new(12345)
-
-// Should choose all elements with roughly equal probability.
-var list = ["a", "b", "c"]
-var histogram = {}
-for (i in 1..5000) {
-  var sample = random.sample(list, 3)
-  var string = sample.toString
-  if (!histogram.containsKey(string)) histogram[string] = 0
-  histogram[string] = histogram[string] + 1
-}
-
-System.print(histogram.count) // expect: 6
-for (key in histogram.keys) {
-  var error = (histogram[key] / (5000 / 6) - 1).abs
-  if (error > 0.1) System.print("!!! %(error)")
-}
diff --git a/test/random/sample_count_multiple.wren b/test/random/sample_count_multiple.wren
index a330f355..2dd34753 100644
--- a/test/random/sample_count_multiple.wren
+++ b/test/random/sample_count_multiple.wren
@@ -3,17 +3,25 @@ import "random" for Random
 var random = Random.new(12345)
 
 // Should choose all elements with roughly equal probability.
-var list = ["a", "b", "c", "d"]
-var histogram = {}
-for (i in 1..5000) {
-  var sample = random.sample(list, 3)
-  var string = sample.toString
-  if (!histogram.containsKey(string)) histogram[string] = 0
-  histogram[string] = histogram[string] + 1
-}
+var list = (0...10).toList
+var binom = [1, 10, 45, 120, 210, 252, 210, 120, 45, 10, 1]
 
-System.print(histogram.count) // expect: 24
-for (key in histogram.keys) {
-  var error = (histogram[key] / (5000 / 24) - 1).abs
-  if (error > 0.2) System.print("!!! %(error)")
+for (k in 0..10) {
+  var count = binom[k]
+
+  var histogram = {}
+  for (i in 1..count * 100) {
+    var sample = random.sample(list, k)
+    // Create a bitmask to represent the unordered set.
+    var bitmask = 0
+    sample.each {|s| bitmask = bitmask | (1 << s) }
+    if (!histogram.containsKey(bitmask)) histogram[bitmask] = 0
+    histogram[bitmask] = histogram[bitmask] + 1
+  }
+
+  if (histogram.count != count) System.print("!!! %(count) %(histogram.count)")
+  for (key in histogram.keys) {
+    var error = (histogram[key] - 100).abs
+    if (error > 50) System.print("!!! %(error)")
+  }
 }