mirror of
https://github.com/wren-lang/wren.git
synced 2026-01-11 14:18:42 +01:00
Optimize Random.sample(_, _) for performance (#716)
* Optimize Random.sample(_, _) for performance
* Make tests treat random samples as unordered
* Test all sample sizes possible
* Tweak random sampling algorithm for performance
This commit is contained in:
@ -47,65 +47,38 @@ foreign class Random {
|
|||||||
int(end) { (float() * end).floor }
|
int(end) { (float() * end).floor }
|
||||||
int(start, end) { (float() * (end - start)).floor + start }
|
int(start, end) { (float() * (end - start)).floor + start }
|
||||||
|
|
||||||
sample(list) { sample(list, 1)[0] }
|
sample(list) {
|
||||||
|
if (list.count == 0) Fiber.abort("Not enough elements to sample.")
|
||||||
|
return list[int(list.count)]
|
||||||
|
}
|
||||||
sample(list, count) {
|
sample(list, count) {
|
||||||
if (count > list.count) Fiber.abort("Not enough elements to sample.")
|
if (count > list.count) Fiber.abort("Not enough elements to sample.")
|
||||||
|
|
||||||
// There are (at least) two simple algorithms for choosing a number of
|
var result = []
|
||||||
// samples from a list without replacement -- where we don't pick the same
|
|
||||||
// element more than once.
|
|
||||||
//
|
|
||||||
// The first is faster when the number of samples is small relative to the
|
|
||||||
// size of the collection. In many cases, it avoids scanning the entire
|
|
||||||
// list. In the common case of just wanting one sample, it's a single
|
|
||||||
// random index lookup.
|
|
||||||
//
|
|
||||||
// However, its performance degrades badly as the sample size increases.
|
|
||||||
// Vitter's algorithm always scans the entire list, but it's also always
|
|
||||||
// O(n).
|
|
||||||
//
|
|
||||||
// The cutoff point between the two follows a quadratic curve on the same
|
|
||||||
// size. Based on some empirical testing, scaling that by 5 seems to fit
|
|
||||||
// pretty closely and chooses the fastest one for the given sample and
|
|
||||||
// collection size.
|
|
||||||
if (count * count * 5 < list.count) {
|
|
||||||
// Pick random elements and retry if you hit a previously chosen one.
|
|
||||||
var picked = {}
|
|
||||||
var result = []
|
|
||||||
for (i in 0...count) {
|
|
||||||
// Find an index that we haven't already selected.
|
|
||||||
var index
|
|
||||||
while (true) {
|
|
||||||
index = int(list.count)
|
|
||||||
if (!picked.containsKey(index)) break
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// The algorithm described in "Programming pearls: a sample of brilliance".
|
||||||
|
// Use a hash map for sample sizes less than 1/4 of the population size and
|
||||||
|
// an array of booleans for larger samples. This simple heuristic improves
|
||||||
|
// performance for large sample sizes as well as reduces memory usage.
|
||||||
|
if (count * 4 < list.count) {
|
||||||
|
var picked = {}
|
||||||
|
for (i in list.count - count...list.count) {
|
||||||
|
var index = int(i + 1)
|
||||||
|
if (picked.containsKey(index)) index = i
|
||||||
picked[index] = true
|
picked[index] = true
|
||||||
result.add(list[index])
|
result.add(list[index])
|
||||||
}
|
}
|
||||||
|
|
||||||
return result
|
|
||||||
} else {
|
} else {
|
||||||
// Jeffrey Vitter's Algorithm R.
|
var picked = List.filled(list.count, false)
|
||||||
|
for (i in list.count - count...list.count) {
|
||||||
// Fill the reservoir with the first elements in the list.
|
var index = int(i + 1)
|
||||||
var result = list[0...count]
|
if (picked[index]) index = i
|
||||||
|
picked[index] = true
|
||||||
// We want to ensure the results are always in random order, so shuffle
|
result.add(list[index])
|
||||||
// them. In cases where the sample size is the entire collection, this
|
|
||||||
// devolves to running Fisher-Yates on a copy of the list.
|
|
||||||
shuffle(result)
|
|
||||||
|
|
||||||
// Now walk the rest of the list. For each element, randomly consider
|
|
||||||
// replacing one of the reservoir elements with it. The probability here
|
|
||||||
// works out such that it does this uniformly.
|
|
||||||
for (i in count...list.count) {
|
|
||||||
var slot = int(0, i + 1)
|
|
||||||
if (slot < count) result[slot] = list[i]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return result
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
shuffle(list) {
|
shuffle(list) {
|
||||||
|
|||||||
@ -49,65 +49,38 @@ static const char* randomModuleSource =
|
|||||||
" int(end) { (float() * end).floor }\n"
|
" int(end) { (float() * end).floor }\n"
|
||||||
" int(start, end) { (float() * (end - start)).floor + start }\n"
|
" int(start, end) { (float() * (end - start)).floor + start }\n"
|
||||||
"\n"
|
"\n"
|
||||||
" sample(list) { sample(list, 1)[0] }\n"
|
" sample(list) {\n"
|
||||||
|
" if (list.count == 0) Fiber.abort(\"Not enough elements to sample.\")\n"
|
||||||
|
" return list[int(list.count)]\n"
|
||||||
|
" }\n"
|
||||||
" sample(list, count) {\n"
|
" sample(list, count) {\n"
|
||||||
" if (count > list.count) Fiber.abort(\"Not enough elements to sample.\")\n"
|
" if (count > list.count) Fiber.abort(\"Not enough elements to sample.\")\n"
|
||||||
"\n"
|
"\n"
|
||||||
" // There are (at least) two simple algorithms for choosing a number of\n"
|
" var result = []\n"
|
||||||
" // samples from a list without replacement -- where we don't pick the same\n"
|
|
||||||
" // element more than once.\n"
|
|
||||||
" //\n"
|
|
||||||
" // The first is faster when the number of samples is small relative to the\n"
|
|
||||||
" // size of the collection. In many cases, it avoids scanning the entire\n"
|
|
||||||
" // list. In the common case of just wanting one sample, it's a single\n"
|
|
||||||
" // random index lookup.\n"
|
|
||||||
" //\n"
|
|
||||||
" // However, its performance degrades badly as the sample size increases.\n"
|
|
||||||
" // Vitter's algorithm always scans the entire list, but it's also always\n"
|
|
||||||
" // O(n).\n"
|
|
||||||
" //\n"
|
|
||||||
" // The cutoff point between the two follows a quadratic curve on the same\n"
|
|
||||||
" // size. Based on some empirical testing, scaling that by 5 seems to fit\n"
|
|
||||||
" // pretty closely and chooses the fastest one for the given sample and\n"
|
|
||||||
" // collection size.\n"
|
|
||||||
" if (count * count * 5 < list.count) {\n"
|
|
||||||
" // Pick random elements and retry if you hit a previously chosen one.\n"
|
|
||||||
" var picked = {}\n"
|
|
||||||
" var result = []\n"
|
|
||||||
" for (i in 0...count) {\n"
|
|
||||||
" // Find an index that we haven't already selected.\n"
|
|
||||||
" var index\n"
|
|
||||||
" while (true) {\n"
|
|
||||||
" index = int(list.count)\n"
|
|
||||||
" if (!picked.containsKey(index)) break\n"
|
|
||||||
" }\n"
|
|
||||||
"\n"
|
"\n"
|
||||||
|
" // The algorithm described in \"Programming pearls: a sample of brilliance\".\n"
|
||||||
|
" // Use a hash map for sample sizes less than 1/4 of the population size and\n"
|
||||||
|
" // an array of booleans for larger samples. This simple heuristic improves\n"
|
||||||
|
" // performance for large sample sizes as well as reduces memory usage.\n"
|
||||||
|
" if (count * 4 < list.count) {\n"
|
||||||
|
" var picked = {}\n"
|
||||||
|
" for (i in list.count - count...list.count) {\n"
|
||||||
|
" var index = int(i + 1)\n"
|
||||||
|
" if (picked.containsKey(index)) index = i\n"
|
||||||
" picked[index] = true\n"
|
" picked[index] = true\n"
|
||||||
" result.add(list[index])\n"
|
" result.add(list[index])\n"
|
||||||
" }\n"
|
" }\n"
|
||||||
"\n"
|
|
||||||
" return result\n"
|
|
||||||
" } else {\n"
|
" } else {\n"
|
||||||
" // Jeffrey Vitter's Algorithm R.\n"
|
" var picked = List.filled(list.count, false)\n"
|
||||||
"\n"
|
" for (i in list.count - count...list.count) {\n"
|
||||||
" // Fill the reservoir with the first elements in the list.\n"
|
" var index = int(i + 1)\n"
|
||||||
" var result = list[0...count]\n"
|
" if (picked[index]) index = i\n"
|
||||||
"\n"
|
" picked[index] = true\n"
|
||||||
" // We want to ensure the results are always in random order, so shuffle\n"
|
" result.add(list[index])\n"
|
||||||
" // them. In cases where the sample size is the entire collection, this\n"
|
|
||||||
" // devolves to running Fisher-Yates on a copy of the list.\n"
|
|
||||||
" shuffle(result)\n"
|
|
||||||
"\n"
|
|
||||||
" // Now walk the rest of the list. For each element, randomly consider\n"
|
|
||||||
" // replacing one of the reservoir elements with it. The probability here\n"
|
|
||||||
" // works out such that it does this uniformly.\n"
|
|
||||||
" for (i in count...list.count) {\n"
|
|
||||||
" var slot = int(0, i + 1)\n"
|
|
||||||
" if (slot < count) result[slot] = list[i]\n"
|
|
||||||
" }\n"
|
" }\n"
|
||||||
"\n"
|
|
||||||
" return result\n"
|
|
||||||
" }\n"
|
" }\n"
|
||||||
|
"\n"
|
||||||
|
" return result\n"
|
||||||
" }\n"
|
" }\n"
|
||||||
"\n"
|
"\n"
|
||||||
" shuffle(list) {\n"
|
" shuffle(list) {\n"
|
||||||
|
|||||||
@ -1,19 +0,0 @@
|
|||||||
import "random" for Random
|
|
||||||
|
|
||||||
var random = Random.new(12345)
|
|
||||||
|
|
||||||
// Should choose all elements with roughly equal probability.
|
|
||||||
var list = ["a", "b", "c"]
|
|
||||||
var histogram = {}
|
|
||||||
for (i in 1..5000) {
|
|
||||||
var sample = random.sample(list, 3)
|
|
||||||
var string = sample.toString
|
|
||||||
if (!histogram.containsKey(string)) histogram[string] = 0
|
|
||||||
histogram[string] = histogram[string] + 1
|
|
||||||
}
|
|
||||||
|
|
||||||
System.print(histogram.count) // expect: 6
|
|
||||||
for (key in histogram.keys) {
|
|
||||||
var error = (histogram[key] / (5000 / 6) - 1).abs
|
|
||||||
if (error > 0.1) System.print("!!! %(error)")
|
|
||||||
}
|
|
||||||
@ -3,17 +3,25 @@ import "random" for Random
|
|||||||
var random = Random.new(12345)
|
var random = Random.new(12345)
|
||||||
|
|
||||||
// Should choose all elements with roughly equal probability.
|
// Should choose all elements with roughly equal probability.
|
||||||
var list = ["a", "b", "c", "d"]
|
var list = (0...10).toList
|
||||||
var histogram = {}
|
var binom = [1, 10, 45, 120, 210, 252, 210, 120, 45, 10, 1]
|
||||||
for (i in 1..5000) {
|
|
||||||
var sample = random.sample(list, 3)
|
|
||||||
var string = sample.toString
|
|
||||||
if (!histogram.containsKey(string)) histogram[string] = 0
|
|
||||||
histogram[string] = histogram[string] + 1
|
|
||||||
}
|
|
||||||
|
|
||||||
System.print(histogram.count) // expect: 24
|
for (k in 0..10) {
|
||||||
for (key in histogram.keys) {
|
var count = binom[k]
|
||||||
var error = (histogram[key] / (5000 / 24) - 1).abs
|
|
||||||
if (error > 0.2) System.print("!!! %(error)")
|
var histogram = {}
|
||||||
|
for (i in 1..count * 100) {
|
||||||
|
var sample = random.sample(list, k)
|
||||||
|
// Create a bitmask to represent the unordered set.
|
||||||
|
var bitmask = 0
|
||||||
|
sample.each {|s| bitmask = bitmask | (1 << s) }
|
||||||
|
if (!histogram.containsKey(bitmask)) histogram[bitmask] = 0
|
||||||
|
histogram[bitmask] = histogram[bitmask] + 1
|
||||||
|
}
|
||||||
|
|
||||||
|
if (histogram.count != count) System.print("!!! %(count) %(histogram.count)")
|
||||||
|
for (key in histogram.keys) {
|
||||||
|
var error = (histogram[key] - 100).abs
|
||||||
|
if (error > 50) System.print("!!! %(error)")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user