From f3493d049930a004bf0ee24c2ccc02f8ba140b3a Mon Sep 17 00:00:00 2001 From: kawa-yoiko <45892908+kawa-yoiko@users.noreply.github.com> Date: Sun, 14 Jun 2020 12:31:23 +0800 Subject: [PATCH] Optimize Random.sample(_, _) for performance (#716) * Optimize Random.sample(_, _) for performance * Make tests treat random samples as unordered * Test all sample sizes possible * Tweak random sampling algorithm for performance --- src/optional/wren_opt_random.wren | 71 ++++++++------------------ src/optional/wren_opt_random.wren.inc | 71 ++++++++------------------ test/random/sample_count_all.wren | 19 ------- test/random/sample_count_multiple.wren | 32 +++++++----- 4 files changed, 64 insertions(+), 129 deletions(-) delete mode 100644 test/random/sample_count_all.wren diff --git a/src/optional/wren_opt_random.wren b/src/optional/wren_opt_random.wren index b94f53b3..a7813b9f 100644 --- a/src/optional/wren_opt_random.wren +++ b/src/optional/wren_opt_random.wren @@ -47,65 +47,38 @@ foreign class Random { int(end) { (float() * end).floor } int(start, end) { (float() * (end - start)).floor + start } - sample(list) { sample(list, 1)[0] } + sample(list) { + if (list.count == 0) Fiber.abort("Not enough elements to sample.") + return list[int(list.count)] + } sample(list, count) { if (count > list.count) Fiber.abort("Not enough elements to sample.") - // There are (at least) two simple algorithms for choosing a number of - // samples from a list without replacement -- where we don't pick the same - // element more than once. - // - // The first is faster when the number of samples is small relative to the - // size of the collection. In many cases, it avoids scanning the entire - // list. In the common case of just wanting one sample, it's a single - // random index lookup. - // - // However, its performance degrades badly as the sample size increases. - // Vitter's algorithm always scans the entire list, but it's also always - // O(n). - // - // The cutoff point between the two follows a quadratic curve on the same - // size. Based on some empirical testing, scaling that by 5 seems to fit - // pretty closely and chooses the fastest one for the given sample and - // collection size. - if (count * count * 5 < list.count) { - // Pick random elements and retry if you hit a previously chosen one. - var picked = {} - var result = [] - for (i in 0...count) { - // Find an index that we haven't already selected. - var index - while (true) { - index = int(list.count) - if (!picked.containsKey(index)) break - } + var result = [] + // The algorithm described in "Programming pearls: a sample of brilliance". + // Use a hash map for sample sizes less than 1/4 of the population size and + // an array of booleans for larger samples. This simple heuristic improves + // performance for large sample sizes as well as reduces memory usage. + if (count * 4 < list.count) { + var picked = {} + for (i in list.count - count...list.count) { + var index = int(i + 1) + if (picked.containsKey(index)) index = i picked[index] = true result.add(list[index]) } - - return result } else { - // Jeffrey Vitter's Algorithm R. - - // Fill the reservoir with the first elements in the list. - var result = list[0...count] - - // We want to ensure the results are always in random order, so shuffle - // them. In cases where the sample size is the entire collection, this - // devolves to running Fisher-Yates on a copy of the list. - shuffle(result) - - // Now walk the rest of the list. For each element, randomly consider - // replacing one of the reservoir elements with it. The probability here - // works out such that it does this uniformly. - for (i in count...list.count) { - var slot = int(0, i + 1) - if (slot < count) result[slot] = list[i] + var picked = List.filled(list.count, false) + for (i in list.count - count...list.count) { + var index = int(i + 1) + if (picked[index]) index = i + picked[index] = true + result.add(list[index]) } - - return result } + + return result } shuffle(list) { diff --git a/src/optional/wren_opt_random.wren.inc b/src/optional/wren_opt_random.wren.inc index 4ccbc5e9..164f9d58 100644 --- a/src/optional/wren_opt_random.wren.inc +++ b/src/optional/wren_opt_random.wren.inc @@ -49,65 +49,38 @@ static const char* randomModuleSource = " int(end) { (float() * end).floor }\n" " int(start, end) { (float() * (end - start)).floor + start }\n" "\n" -" sample(list) { sample(list, 1)[0] }\n" +" sample(list) {\n" +" if (list.count == 0) Fiber.abort(\"Not enough elements to sample.\")\n" +" return list[int(list.count)]\n" +" }\n" " sample(list, count) {\n" " if (count > list.count) Fiber.abort(\"Not enough elements to sample.\")\n" "\n" -" // There are (at least) two simple algorithms for choosing a number of\n" -" // samples from a list without replacement -- where we don't pick the same\n" -" // element more than once.\n" -" //\n" -" // The first is faster when the number of samples is small relative to the\n" -" // size of the collection. In many cases, it avoids scanning the entire\n" -" // list. In the common case of just wanting one sample, it's a single\n" -" // random index lookup.\n" -" //\n" -" // However, its performance degrades badly as the sample size increases.\n" -" // Vitter's algorithm always scans the entire list, but it's also always\n" -" // O(n).\n" -" //\n" -" // The cutoff point between the two follows a quadratic curve on the same\n" -" // size. Based on some empirical testing, scaling that by 5 seems to fit\n" -" // pretty closely and chooses the fastest one for the given sample and\n" -" // collection size.\n" -" if (count * count * 5 < list.count) {\n" -" // Pick random elements and retry if you hit a previously chosen one.\n" -" var picked = {}\n" -" var result = []\n" -" for (i in 0...count) {\n" -" // Find an index that we haven't already selected.\n" -" var index\n" -" while (true) {\n" -" index = int(list.count)\n" -" if (!picked.containsKey(index)) break\n" -" }\n" +" var result = []\n" "\n" +" // The algorithm described in \"Programming pearls: a sample of brilliance\".\n" +" // Use a hash map for sample sizes less than 1/4 of the population size and\n" +" // an array of booleans for larger samples. This simple heuristic improves\n" +" // performance for large sample sizes as well as reduces memory usage.\n" +" if (count * 4 < list.count) {\n" +" var picked = {}\n" +" for (i in list.count - count...list.count) {\n" +" var index = int(i + 1)\n" +" if (picked.containsKey(index)) index = i\n" " picked[index] = true\n" " result.add(list[index])\n" " }\n" -"\n" -" return result\n" " } else {\n" -" // Jeffrey Vitter's Algorithm R.\n" -"\n" -" // Fill the reservoir with the first elements in the list.\n" -" var result = list[0...count]\n" -"\n" -" // We want to ensure the results are always in random order, so shuffle\n" -" // them. In cases where the sample size is the entire collection, this\n" -" // devolves to running Fisher-Yates on a copy of the list.\n" -" shuffle(result)\n" -"\n" -" // Now walk the rest of the list. For each element, randomly consider\n" -" // replacing one of the reservoir elements with it. The probability here\n" -" // works out such that it does this uniformly.\n" -" for (i in count...list.count) {\n" -" var slot = int(0, i + 1)\n" -" if (slot < count) result[slot] = list[i]\n" +" var picked = List.filled(list.count, false)\n" +" for (i in list.count - count...list.count) {\n" +" var index = int(i + 1)\n" +" if (picked[index]) index = i\n" +" picked[index] = true\n" +" result.add(list[index])\n" " }\n" -"\n" -" return result\n" " }\n" +"\n" +" return result\n" " }\n" "\n" " shuffle(list) {\n" diff --git a/test/random/sample_count_all.wren b/test/random/sample_count_all.wren deleted file mode 100644 index fb8383ca..00000000 --- a/test/random/sample_count_all.wren +++ /dev/null @@ -1,19 +0,0 @@ -import "random" for Random - -var random = Random.new(12345) - -// Should choose all elements with roughly equal probability. -var list = ["a", "b", "c"] -var histogram = {} -for (i in 1..5000) { - var sample = random.sample(list, 3) - var string = sample.toString - if (!histogram.containsKey(string)) histogram[string] = 0 - histogram[string] = histogram[string] + 1 -} - -System.print(histogram.count) // expect: 6 -for (key in histogram.keys) { - var error = (histogram[key] / (5000 / 6) - 1).abs - if (error > 0.1) System.print("!!! %(error)") -} diff --git a/test/random/sample_count_multiple.wren b/test/random/sample_count_multiple.wren index a330f355..2dd34753 100644 --- a/test/random/sample_count_multiple.wren +++ b/test/random/sample_count_multiple.wren @@ -3,17 +3,25 @@ import "random" for Random var random = Random.new(12345) // Should choose all elements with roughly equal probability. -var list = ["a", "b", "c", "d"] -var histogram = {} -for (i in 1..5000) { - var sample = random.sample(list, 3) - var string = sample.toString - if (!histogram.containsKey(string)) histogram[string] = 0 - histogram[string] = histogram[string] + 1 -} +var list = (0...10).toList +var binom = [1, 10, 45, 120, 210, 252, 210, 120, 45, 10, 1] -System.print(histogram.count) // expect: 24 -for (key in histogram.keys) { - var error = (histogram[key] / (5000 / 24) - 1).abs - if (error > 0.2) System.print("!!! %(error)") +for (k in 0..10) { + var count = binom[k] + + var histogram = {} + for (i in 1..count * 100) { + var sample = random.sample(list, k) + // Create a bitmask to represent the unordered set. + var bitmask = 0 + sample.each {|s| bitmask = bitmask | (1 << s) } + if (!histogram.containsKey(bitmask)) histogram[bitmask] = 0 + histogram[bitmask] = histogram[bitmask] + 1 + } + + if (histogram.count != count) System.print("!!! %(count) %(histogram.count)") + for (key in histogram.keys) { + var error = (histogram[key] - 100).abs + if (error > 50) System.print("!!! %(error)") + } }