mirror of
https://github.com/wren-lang/wren.git
synced 2026-01-11 14:18:42 +01:00
Optimize Random.sample(_, _) for performance (#716)
* Optimize Random.sample(_, _) for performance * Make tests treat random samples as unordered * Test all sample sizes possible * Tweak random sampling algorithm for performance
This commit is contained in:
@ -47,65 +47,38 @@ foreign class Random {
|
||||
int(end) { (float() * end).floor }
|
||||
int(start, end) { (float() * (end - start)).floor + start }
|
||||
|
||||
sample(list) { sample(list, 1)[0] }
|
||||
sample(list) {
|
||||
if (list.count == 0) Fiber.abort("Not enough elements to sample.")
|
||||
return list[int(list.count)]
|
||||
}
|
||||
sample(list, count) {
|
||||
if (count > list.count) Fiber.abort("Not enough elements to sample.")
|
||||
|
||||
// There are (at least) two simple algorithms for choosing a number of
|
||||
// samples from a list without replacement -- where we don't pick the same
|
||||
// element more than once.
|
||||
//
|
||||
// The first is faster when the number of samples is small relative to the
|
||||
// size of the collection. In many cases, it avoids scanning the entire
|
||||
// list. In the common case of just wanting one sample, it's a single
|
||||
// random index lookup.
|
||||
//
|
||||
// However, its performance degrades badly as the sample size increases.
|
||||
// Vitter's algorithm always scans the entire list, but it's also always
|
||||
// O(n).
|
||||
//
|
||||
// The cutoff point between the two follows a quadratic curve on the same
|
||||
// size. Based on some empirical testing, scaling that by 5 seems to fit
|
||||
// pretty closely and chooses the fastest one for the given sample and
|
||||
// collection size.
|
||||
if (count * count * 5 < list.count) {
|
||||
// Pick random elements and retry if you hit a previously chosen one.
|
||||
var picked = {}
|
||||
var result = []
|
||||
for (i in 0...count) {
|
||||
// Find an index that we haven't already selected.
|
||||
var index
|
||||
while (true) {
|
||||
index = int(list.count)
|
||||
if (!picked.containsKey(index)) break
|
||||
}
|
||||
var result = []
|
||||
|
||||
// The algorithm described in "Programming pearls: a sample of brilliance".
|
||||
// Use a hash map for sample sizes less than 1/4 of the population size and
|
||||
// an array of booleans for larger samples. This simple heuristic improves
|
||||
// performance for large sample sizes as well as reduces memory usage.
|
||||
if (count * 4 < list.count) {
|
||||
var picked = {}
|
||||
for (i in list.count - count...list.count) {
|
||||
var index = int(i + 1)
|
||||
if (picked.containsKey(index)) index = i
|
||||
picked[index] = true
|
||||
result.add(list[index])
|
||||
}
|
||||
|
||||
return result
|
||||
} else {
|
||||
// Jeffrey Vitter's Algorithm R.
|
||||
|
||||
// Fill the reservoir with the first elements in the list.
|
||||
var result = list[0...count]
|
||||
|
||||
// We want to ensure the results are always in random order, so shuffle
|
||||
// them. In cases where the sample size is the entire collection, this
|
||||
// devolves to running Fisher-Yates on a copy of the list.
|
||||
shuffle(result)
|
||||
|
||||
// Now walk the rest of the list. For each element, randomly consider
|
||||
// replacing one of the reservoir elements with it. The probability here
|
||||
// works out such that it does this uniformly.
|
||||
for (i in count...list.count) {
|
||||
var slot = int(0, i + 1)
|
||||
if (slot < count) result[slot] = list[i]
|
||||
var picked = List.filled(list.count, false)
|
||||
for (i in list.count - count...list.count) {
|
||||
var index = int(i + 1)
|
||||
if (picked[index]) index = i
|
||||
picked[index] = true
|
||||
result.add(list[index])
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
shuffle(list) {
|
||||
|
||||
@ -49,65 +49,38 @@ static const char* randomModuleSource =
|
||||
" int(end) { (float() * end).floor }\n"
|
||||
" int(start, end) { (float() * (end - start)).floor + start }\n"
|
||||
"\n"
|
||||
" sample(list) { sample(list, 1)[0] }\n"
|
||||
" sample(list) {\n"
|
||||
" if (list.count == 0) Fiber.abort(\"Not enough elements to sample.\")\n"
|
||||
" return list[int(list.count)]\n"
|
||||
" }\n"
|
||||
" sample(list, count) {\n"
|
||||
" if (count > list.count) Fiber.abort(\"Not enough elements to sample.\")\n"
|
||||
"\n"
|
||||
" // There are (at least) two simple algorithms for choosing a number of\n"
|
||||
" // samples from a list without replacement -- where we don't pick the same\n"
|
||||
" // element more than once.\n"
|
||||
" //\n"
|
||||
" // The first is faster when the number of samples is small relative to the\n"
|
||||
" // size of the collection. In many cases, it avoids scanning the entire\n"
|
||||
" // list. In the common case of just wanting one sample, it's a single\n"
|
||||
" // random index lookup.\n"
|
||||
" //\n"
|
||||
" // However, its performance degrades badly as the sample size increases.\n"
|
||||
" // Vitter's algorithm always scans the entire list, but it's also always\n"
|
||||
" // O(n).\n"
|
||||
" //\n"
|
||||
" // The cutoff point between the two follows a quadratic curve on the same\n"
|
||||
" // size. Based on some empirical testing, scaling that by 5 seems to fit\n"
|
||||
" // pretty closely and chooses the fastest one for the given sample and\n"
|
||||
" // collection size.\n"
|
||||
" if (count * count * 5 < list.count) {\n"
|
||||
" // Pick random elements and retry if you hit a previously chosen one.\n"
|
||||
" var picked = {}\n"
|
||||
" var result = []\n"
|
||||
" for (i in 0...count) {\n"
|
||||
" // Find an index that we haven't already selected.\n"
|
||||
" var index\n"
|
||||
" while (true) {\n"
|
||||
" index = int(list.count)\n"
|
||||
" if (!picked.containsKey(index)) break\n"
|
||||
" }\n"
|
||||
" var result = []\n"
|
||||
"\n"
|
||||
" // The algorithm described in \"Programming pearls: a sample of brilliance\".\n"
|
||||
" // Use a hash map for sample sizes less than 1/4 of the population size and\n"
|
||||
" // an array of booleans for larger samples. This simple heuristic improves\n"
|
||||
" // performance for large sample sizes as well as reduces memory usage.\n"
|
||||
" if (count * 4 < list.count) {\n"
|
||||
" var picked = {}\n"
|
||||
" for (i in list.count - count...list.count) {\n"
|
||||
" var index = int(i + 1)\n"
|
||||
" if (picked.containsKey(index)) index = i\n"
|
||||
" picked[index] = true\n"
|
||||
" result.add(list[index])\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" return result\n"
|
||||
" } else {\n"
|
||||
" // Jeffrey Vitter's Algorithm R.\n"
|
||||
"\n"
|
||||
" // Fill the reservoir with the first elements in the list.\n"
|
||||
" var result = list[0...count]\n"
|
||||
"\n"
|
||||
" // We want to ensure the results are always in random order, so shuffle\n"
|
||||
" // them. In cases where the sample size is the entire collection, this\n"
|
||||
" // devolves to running Fisher-Yates on a copy of the list.\n"
|
||||
" shuffle(result)\n"
|
||||
"\n"
|
||||
" // Now walk the rest of the list. For each element, randomly consider\n"
|
||||
" // replacing one of the reservoir elements with it. The probability here\n"
|
||||
" // works out such that it does this uniformly.\n"
|
||||
" for (i in count...list.count) {\n"
|
||||
" var slot = int(0, i + 1)\n"
|
||||
" if (slot < count) result[slot] = list[i]\n"
|
||||
" var picked = List.filled(list.count, false)\n"
|
||||
" for (i in list.count - count...list.count) {\n"
|
||||
" var index = int(i + 1)\n"
|
||||
" if (picked[index]) index = i\n"
|
||||
" picked[index] = true\n"
|
||||
" result.add(list[index])\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" return result\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" return result\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" shuffle(list) {\n"
|
||||
|
||||
@ -1,19 +0,0 @@
|
||||
import "random" for Random
|
||||
|
||||
var random = Random.new(12345)
|
||||
|
||||
// Should choose all elements with roughly equal probability.
|
||||
var list = ["a", "b", "c"]
|
||||
var histogram = {}
|
||||
for (i in 1..5000) {
|
||||
var sample = random.sample(list, 3)
|
||||
var string = sample.toString
|
||||
if (!histogram.containsKey(string)) histogram[string] = 0
|
||||
histogram[string] = histogram[string] + 1
|
||||
}
|
||||
|
||||
System.print(histogram.count) // expect: 6
|
||||
for (key in histogram.keys) {
|
||||
var error = (histogram[key] / (5000 / 6) - 1).abs
|
||||
if (error > 0.1) System.print("!!! %(error)")
|
||||
}
|
||||
@ -3,17 +3,25 @@ import "random" for Random
|
||||
var random = Random.new(12345)
|
||||
|
||||
// Should choose all elements with roughly equal probability.
|
||||
var list = ["a", "b", "c", "d"]
|
||||
var histogram = {}
|
||||
for (i in 1..5000) {
|
||||
var sample = random.sample(list, 3)
|
||||
var string = sample.toString
|
||||
if (!histogram.containsKey(string)) histogram[string] = 0
|
||||
histogram[string] = histogram[string] + 1
|
||||
}
|
||||
var list = (0...10).toList
|
||||
var binom = [1, 10, 45, 120, 210, 252, 210, 120, 45, 10, 1]
|
||||
|
||||
System.print(histogram.count) // expect: 24
|
||||
for (key in histogram.keys) {
|
||||
var error = (histogram[key] / (5000 / 24) - 1).abs
|
||||
if (error > 0.2) System.print("!!! %(error)")
|
||||
for (k in 0..10) {
|
||||
var count = binom[k]
|
||||
|
||||
var histogram = {}
|
||||
for (i in 1..count * 100) {
|
||||
var sample = random.sample(list, k)
|
||||
// Create a bitmask to represent the unordered set.
|
||||
var bitmask = 0
|
||||
sample.each {|s| bitmask = bitmask | (1 << s) }
|
||||
if (!histogram.containsKey(bitmask)) histogram[bitmask] = 0
|
||||
histogram[bitmask] = histogram[bitmask] + 1
|
||||
}
|
||||
|
||||
if (histogram.count != count) System.print("!!! %(count) %(histogram.count)")
|
||||
for (key in histogram.keys) {
|
||||
var error = (histogram[key] - 100).abs
|
||||
if (error > 50) System.print("!!! %(error)")
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user