From 8b36e2f00bdab070675a8d97466ca971973c699a Mon Sep 17 00:00:00 2001 From: Bob Nystrom Date: Tue, 9 Feb 2016 07:24:45 -0800 Subject: [PATCH] Add sample(_) and sample(_,_) to Random. --- doc/site/modules/random/random.markdown | 15 ++++++ src/optional/wren_opt_random.wren | 61 +++++++++++++++++++++++++ src/optional/wren_opt_random.wren.inc | 61 +++++++++++++++++++++++++ test/random/sample_count_all.wren | 19 ++++++++ test/random/sample_count_multiple.wren | 19 ++++++++ test/random/sample_count_one.wren | 23 ++++++++++ test/random/sample_count_too_many.wren | 5 ++ test/random/sample_count_zero.wren | 6 +++ test/random/sample_one.wren | 20 ++++++++ test/random/sample_one_empty.wren | 5 ++ test/random/shuffle.wren | 19 ++++++-- 11 files changed, 248 insertions(+), 5 deletions(-) create mode 100644 test/random/sample_count_all.wren create mode 100644 test/random/sample_count_multiple.wren create mode 100644 test/random/sample_count_one.wren create mode 100644 test/random/sample_count_too_many.wren create mode 100644 test/random/sample_count_zero.wren create mode 100644 test/random/sample_one.wren create mode 100644 test/random/sample_one_empty.wren diff --git a/doc/site/modules/random/random.markdown b/doc/site/modules/random/random.markdown index 7133ab94..26092275 100644 --- a/doc/site/modules/random/random.markdown +++ b/doc/site/modules/random/random.markdown @@ -94,6 +94,21 @@ Returns an integer between `start` and `end`, including `start` but excluding System.print(random.int(-10, 10)) //> -6 System.print(random.int(-4, 2)) //> -2 +### **sample**(list) + +Selects a random element from `list`. + +### **sample**(list, count) + +Samples `count` randomly chosen unique elements from `list`. + +This uses "random without replacement" sampling—no index in the list will +be selected more than once. + +Returns a new list of the selected elements. + +It is an error if `count` is greater than the number of elements in the list. 
+
 ### **shuffle**(list)
 
 Randomly shuffles the elements in `list`. The items are randomly re-ordered in
diff --git a/src/optional/wren_opt_random.wren b/src/optional/wren_opt_random.wren
index 17430f58..829337e3 100644
--- a/src/optional/wren_opt_random.wren
+++ b/src/optional/wren_opt_random.wren
@@ -47,6 +47,67 @@ foreign class Random {
   int(end) { (float() * end).floor }
   int(start, end) { (float() * (end - start)).floor + start }
 
+  sample(list) { sample(list, 1)[0] }
+  sample(list, count) {
+    if (count > list.count) Fiber.abort("Not enough elements to sample.")
+
+    // There are (at least) two simple algorithms for choosing a number of
+    // samples from a list without replacement -- where we don't pick the same
+    // element more than once.
+    //
+    // The first is faster when the number of samples is small relative to the
+    // size of the collection. In many cases, it avoids scanning the entire
+    // list. In the common case of just wanting one sample, it's a single
+    // random index lookup.
+    //
+    // However, its performance degrades badly as the sample size increases.
+    // Vitter's algorithm always scans the entire list, but it's also always
+    // O(n).
+    //
+    // The cutoff point between the two follows a quadratic curve on the sample
+    // size. Based on some empirical testing, scaling that by 5 seems to fit
+    // pretty closely and chooses the fastest one for the given sample and
+    // collection size.
+    if (count * count * 5 < list.count) {
+      // Pick random elements and retry if you hit a previously chosen one.
+      var picked = {}
+      var result = []
+      for (i in 0...count) {
+        // Draw from the whole list until we hit an index not already selected.
+        var index
+        while (true) {
+          index = int(list.count)
+          if (!picked.containsKey(index)) break
+        }
+
+        picked[index] = true
+        result.add(list[index])
+      }
+
+      return result
+    } else {
+      // Jeffrey Vitter's Algorithm R.
+
+      // Fill the reservoir with the first elements in the list.
+      var result = list[0...count]
+
+      // We want to ensure the results are always in random order, so shuffle
+      // them. In cases where the sample size is the entire collection, this
+      // devolves to running Fisher-Yates on a copy of the list.
+      shuffle(result)
+
+      // Now walk the rest of the list. For each element, randomly consider
+      // replacing one of the reservoir elements with it. The probability here
+      // works out such that it does this uniformly.
+      for (i in count...list.count) {
+        var slot = int(0, i + 1)
+        if (slot < count) result[slot] = list[i]
+      }
+
+      return result
+    }
+  }
+
   shuffle(list) {
     if (list.isEmpty) return
 
diff --git a/src/optional/wren_opt_random.wren.inc b/src/optional/wren_opt_random.wren.inc
index d730d3ac..ff217a4f 100644
--- a/src/optional/wren_opt_random.wren.inc
+++ b/src/optional/wren_opt_random.wren.inc
@@ -49,6 +49,67 @@ static const char* randomModuleSource =
 "  int(end) { (float() * end).floor }\n"
 "  int(start, end) { (float() * (end - start)).floor + start }\n"
 "\n"
+"  sample(list) { sample(list, 1)[0] }\n"
+"  sample(list, count) {\n"
+"    if (count > list.count) Fiber.abort(\"Not enough elements to sample.\")\n"
+"\n"
+"    // There are (at least) two simple algorithms for choosing a number of\n"
+"    // samples from a list without replacement -- where we don't pick the same\n"
+"    // element more than once.\n"
+"    //\n"
+"    // The first is faster when the number of samples is small relative to the\n"
+"    // size of the collection. In many cases, it avoids scanning the entire\n"
+"    // list. In the common case of just wanting one sample, it's a single\n"
+"    // random index lookup.\n"
+"    //\n"
+"    // However, its performance degrades badly as the sample size increases.\n"
+"    // Vitter's algorithm always scans the entire list, but it's also always\n"
+"    // O(n).\n"
+"    //\n"
+"    // The cutoff point between the two follows a quadratic curve on the sample\n"
+"    // size. Based on some empirical testing, scaling that by 5 seems to fit\n"
+"    // pretty closely and chooses the fastest one for the given sample and\n"
+"    // collection size.\n"
+"    if (count * count * 5 < list.count) {\n"
+"      // Pick random elements and retry if you hit a previously chosen one.\n"
+"      var picked = {}\n"
+"      var result = []\n"
+"      for (i in 0...count) {\n"
+"        // Draw from the whole list until we hit an index not already selected.\n"
+"        var index\n"
+"        while (true) {\n"
+"          index = int(list.count)\n"
+"          if (!picked.containsKey(index)) break\n"
+"        }\n"
+"\n"
+"        picked[index] = true\n"
+"        result.add(list[index])\n"
+"      }\n"
+"\n"
+"      return result\n"
+"    } else {\n"
+"      // Jeffrey Vitter's Algorithm R.\n"
+"\n"
+"      // Fill the reservoir with the first elements in the list.\n"
+"      var result = list[0...count]\n"
+"\n"
+"      // We want to ensure the results are always in random order, so shuffle\n"
+"      // them. In cases where the sample size is the entire collection, this\n"
+"      // devolves to running Fisher-Yates on a copy of the list.\n"
+"      shuffle(result)\n"
+"\n"
+"      // Now walk the rest of the list. For each element, randomly consider\n"
+"      // replacing one of the reservoir elements with it. The probability here\n"
+"      // works out such that it does this uniformly.\n"
+"      for (i in count...list.count) {\n"
+"        var slot = int(0, i + 1)\n"
+"        if (slot < count) result[slot] = list[i]\n"
+"      }\n"
+"\n"
+"      return result\n"
+"    }\n"
+"  }\n"
+"\n"
 "  shuffle(list) {\n"
 "    if (list.isEmpty) return\n"
 "\n"
diff --git a/test/random/sample_count_all.wren b/test/random/sample_count_all.wren
new file mode 100644
index 00000000..fb8383ca
--- /dev/null
+++ b/test/random/sample_count_all.wren
@@ -0,0 +1,19 @@
+import "random" for Random
+
+var random = Random.new(12345)
+
+// Should choose all permutations with roughly equal probability.
+var list = ["a", "b", "c"] +var histogram = {} +for (i in 1..5000) { + var sample = random.sample(list, 3) + var string = sample.toString + if (!histogram.containsKey(string)) histogram[string] = 0 + histogram[string] = histogram[string] + 1 +} + +System.print(histogram.count) // expect: 6 +for (key in histogram.keys) { + var error = (histogram[key] / (5000 / 6) - 1).abs + if (error > 0.1) System.print("!!! %(error)") +} diff --git a/test/random/sample_count_multiple.wren b/test/random/sample_count_multiple.wren new file mode 100644 index 00000000..a330f355 --- /dev/null +++ b/test/random/sample_count_multiple.wren @@ -0,0 +1,19 @@ +import "random" for Random + +var random = Random.new(12345) + +// Should choose every ordered 3-element sample with roughly equal probability. +var list = ["a", "b", "c", "d"] +var histogram = {} +for (i in 1..5000) { + var sample = random.sample(list, 3) + var string = sample.toString + if (!histogram.containsKey(string)) histogram[string] = 0 + histogram[string] = histogram[string] + 1 +} + +System.print(histogram.count) // expect: 24 +for (key in histogram.keys) { + var error = (histogram[key] / (5000 / 24) - 1).abs + if (error > 0.2) System.print("!!! %(error)") +} diff --git a/test/random/sample_count_one.wren b/test/random/sample_count_one.wren new file mode 100644 index 00000000..6f2feb4e --- /dev/null +++ b/test/random/sample_count_one.wren @@ -0,0 +1,23 @@ +import "random" for Random + +var random = Random.new(12345) + +// Single element list. +System.print(random.sample(["single"], 1)) // expect: [single] + +// Should choose all elements with roughly equal probability. 
+var list = ["a", "b", "c", "d", "e"] +var histogram = {} +for (i in 1..5000) { + var sample = random.sample(list, 1) + + var string = sample.toString + if (!histogram.containsKey(string)) histogram[string] = 0 + histogram[string] = histogram[string] + 1 +} + +System.print(histogram.count) // expect: 5 +for (key in histogram.keys) { + var error = (histogram[key] / (5000 / list.count) - 1).abs + if (error > 0.1) System.print("!!! %(error)") +} diff --git a/test/random/sample_count_too_many.wren b/test/random/sample_count_too_many.wren new file mode 100644 index 00000000..19b805ca --- /dev/null +++ b/test/random/sample_count_too_many.wren @@ -0,0 +1,5 @@ +import "random" for Random + +var random = Random.new(12345) + +random.sample([1, 2, 3], 4) // expect runtime error: Not enough elements to sample. diff --git a/test/random/sample_count_zero.wren b/test/random/sample_count_zero.wren new file mode 100644 index 00000000..ca652c43 --- /dev/null +++ b/test/random/sample_count_zero.wren @@ -0,0 +1,6 @@ +import "random" for Random + +var random = Random.new(12345) + +System.print(random.sample([], 0)) // expect: [] +System.print(random.sample([1, 2, 3], 0)) // expect: [] diff --git a/test/random/sample_one.wren b/test/random/sample_one.wren new file mode 100644 index 00000000..dd7fe9bb --- /dev/null +++ b/test/random/sample_one.wren @@ -0,0 +1,20 @@ +import "random" for Random + +var random = Random.new(12345) + +// Single element list. +System.print(random.sample(["single"])) // expect: single + +// Should choose all elements with roughly equal probability. +var list = ["a", "b", "c", "d", "e"] +var histogram = {"a": 0, "b": 0, "c": 0, "d": 0, "e": 0} +for (i in 1..1000) { + var sample = random.sample(list) + histogram[sample] = histogram[sample] + 1 +} + +System.print(histogram.count) // expect: 5 +for (key in histogram.keys) { + var error = (histogram[key] / (1000 / list.count) - 1).abs + if (error > 0.2) System.print("!!! 
%(error)") +} diff --git a/test/random/sample_one_empty.wren b/test/random/sample_one_empty.wren new file mode 100644 index 00000000..b557534e --- /dev/null +++ b/test/random/sample_one_empty.wren @@ -0,0 +1,5 @@ +import "random" for Random + +var random = Random.new(12345) + +random.sample([]) // expect runtime error: Not enough elements to sample. diff --git a/test/random/shuffle.wren b/test/random/shuffle.wren index b3f3e58c..e4dccbbe 100644 --- a/test/random/shuffle.wren +++ b/test/random/shuffle.wren @@ -12,12 +12,21 @@ list = [1] random.shuffle(list) System.print(list) // expect: [1] -// Given enough tries, should generate all permutations. -var hits = {} -for (i in 1..200) { +// Given enough tries, should generate all permutations with roughly equal +// probability. +var histogram = {} +for (i in 1..5000) { var list = [1, 2, 3, 4] random.shuffle(list) - hits[list.toString] = true + + var string = list.toString + if (!histogram.containsKey(string)) histogram[string] = 0 + histogram[string] = histogram[string] + 1 +} + +System.print(histogram.count) // expect: 24 +for (key in histogram.keys) { + var error = (histogram[key] / (5000 / 24) - 1).abs + if (error > 0.2) System.print("!!! %(error)") } -System.print(hits.count) // expect: 24