Optimize Random.sample(_, _) for performance (#716)

* Optimize Random.sample(_, _) for performance
* Make tests treat random samples as unordered
* Test all possible sample sizes
* Tweak random sampling algorithm for performance
This commit is contained in:
kawa-yoiko
2020-06-14 12:31:23 +08:00
committed by GitHub
parent fea0dfafa0
commit f3493d0499
4 changed files with 64 additions and 129 deletions

View File

@ -47,65 +47,38 @@ foreign class Random {
int(end) { (float() * end).floor } int(end) { (float() * end).floor }
int(start, end) { (float() * (end - start)).floor + start } int(start, end) { (float() * (end - start)).floor + start }
sample(list) { sample(list, 1)[0] } sample(list) {
if (list.count == 0) Fiber.abort("Not enough elements to sample.")
return list[int(list.count)]
}
sample(list, count) { sample(list, count) {
if (count > list.count) Fiber.abort("Not enough elements to sample.") if (count > list.count) Fiber.abort("Not enough elements to sample.")
// There are (at least) two simple algorithms for choosing a number of var result = []
// samples from a list without replacement -- where we don't pick the same
// element more than once.
//
// The first is faster when the number of samples is small relative to the
// size of the collection. In many cases, it avoids scanning the entire
// list. In the common case of just wanting one sample, it's a single
// random index lookup.
//
// However, its performance degrades badly as the sample size increases.
// Vitter's algorithm always scans the entire list, but it's also always
// O(n).
//
// The cutoff point between the two follows a quadratic curve on the same
// size. Based on some empirical testing, scaling that by 5 seems to fit
// pretty closely and chooses the fastest one for the given sample and
// collection size.
if (count * count * 5 < list.count) {
// Pick random elements and retry if you hit a previously chosen one.
var picked = {}
var result = []
for (i in 0...count) {
// Find an index that we haven't already selected.
var index
while (true) {
index = int(list.count)
if (!picked.containsKey(index)) break
}
// The algorithm described in "Programming pearls: a sample of brilliance".
// Use a hash map for sample sizes less than 1/4 of the population size and
// an array of booleans for larger samples. This simple heuristic improves
// performance for large sample sizes as well as reduces memory usage.
if (count * 4 < list.count) {
var picked = {}
for (i in list.count - count...list.count) {
var index = int(i + 1)
if (picked.containsKey(index)) index = i
picked[index] = true picked[index] = true
result.add(list[index]) result.add(list[index])
} }
return result
} else { } else {
// Jeffrey Vitter's Algorithm R. var picked = List.filled(list.count, false)
for (i in list.count - count...list.count) {
// Fill the reservoir with the first elements in the list. var index = int(i + 1)
var result = list[0...count] if (picked[index]) index = i
picked[index] = true
// We want to ensure the results are always in random order, so shuffle result.add(list[index])
// them. In cases where the sample size is the entire collection, this
// devolves to running Fisher-Yates on a copy of the list.
shuffle(result)
// Now walk the rest of the list. For each element, randomly consider
// replacing one of the reservoir elements with it. The probability here
// works out such that it does this uniformly.
for (i in count...list.count) {
var slot = int(0, i + 1)
if (slot < count) result[slot] = list[i]
} }
return result
} }
return result
} }
shuffle(list) { shuffle(list) {

View File

@ -49,65 +49,38 @@ static const char* randomModuleSource =
" int(end) { (float() * end).floor }\n" " int(end) { (float() * end).floor }\n"
" int(start, end) { (float() * (end - start)).floor + start }\n" " int(start, end) { (float() * (end - start)).floor + start }\n"
"\n" "\n"
" sample(list) { sample(list, 1)[0] }\n" " sample(list) {\n"
" if (list.count == 0) Fiber.abort(\"Not enough elements to sample.\")\n"
" return list[int(list.count)]\n"
" }\n"
" sample(list, count) {\n" " sample(list, count) {\n"
" if (count > list.count) Fiber.abort(\"Not enough elements to sample.\")\n" " if (count > list.count) Fiber.abort(\"Not enough elements to sample.\")\n"
"\n" "\n"
" // There are (at least) two simple algorithms for choosing a number of\n" " var result = []\n"
" // samples from a list without replacement -- where we don't pick the same\n"
" // element more than once.\n"
" //\n"
" // The first is faster when the number of samples is small relative to the\n"
" // size of the collection. In many cases, it avoids scanning the entire\n"
" // list. In the common case of just wanting one sample, it's a single\n"
" // random index lookup.\n"
" //\n"
" // However, its performance degrades badly as the sample size increases.\n"
" // Vitter's algorithm always scans the entire list, but it's also always\n"
" // O(n).\n"
" //\n"
" // The cutoff point between the two follows a quadratic curve on the same\n"
" // size. Based on some empirical testing, scaling that by 5 seems to fit\n"
" // pretty closely and chooses the fastest one for the given sample and\n"
" // collection size.\n"
" if (count * count * 5 < list.count) {\n"
" // Pick random elements and retry if you hit a previously chosen one.\n"
" var picked = {}\n"
" var result = []\n"
" for (i in 0...count) {\n"
" // Find an index that we haven't already selected.\n"
" var index\n"
" while (true) {\n"
" index = int(list.count)\n"
" if (!picked.containsKey(index)) break\n"
" }\n"
"\n" "\n"
" // The algorithm described in \"Programming pearls: a sample of brilliance\".\n"
" // Use a hash map for sample sizes less than 1/4 of the population size and\n"
" // an array of booleans for larger samples. This simple heuristic improves\n"
" // performance for large sample sizes as well as reduces memory usage.\n"
" if (count * 4 < list.count) {\n"
" var picked = {}\n"
" for (i in list.count - count...list.count) {\n"
" var index = int(i + 1)\n"
" if (picked.containsKey(index)) index = i\n"
" picked[index] = true\n" " picked[index] = true\n"
" result.add(list[index])\n" " result.add(list[index])\n"
" }\n" " }\n"
"\n"
" return result\n"
" } else {\n" " } else {\n"
" // Jeffrey Vitter's Algorithm R.\n" " var picked = List.filled(list.count, false)\n"
"\n" " for (i in list.count - count...list.count) {\n"
" // Fill the reservoir with the first elements in the list.\n" " var index = int(i + 1)\n"
" var result = list[0...count]\n" " if (picked[index]) index = i\n"
"\n" " picked[index] = true\n"
" // We want to ensure the results are always in random order, so shuffle\n" " result.add(list[index])\n"
" // them. In cases where the sample size is the entire collection, this\n"
" // devolves to running Fisher-Yates on a copy of the list.\n"
" shuffle(result)\n"
"\n"
" // Now walk the rest of the list. For each element, randomly consider\n"
" // replacing one of the reservoir elements with it. The probability here\n"
" // works out such that it does this uniformly.\n"
" for (i in count...list.count) {\n"
" var slot = int(0, i + 1)\n"
" if (slot < count) result[slot] = list[i]\n"
" }\n" " }\n"
"\n"
" return result\n"
" }\n" " }\n"
"\n"
" return result\n"
" }\n" " }\n"
"\n" "\n"
" shuffle(list) {\n" " shuffle(list) {\n"

View File

@ -1,19 +0,0 @@
import "random" for Random
// Statistical check of random.sample(list, 3) on a 3-element list: every
// ordering of the elements should come up about equally often.
// Constant seed -- presumably so the run is reproducible; confirm against the
// Random implementation.
var random = Random.new(12345)
// Should choose all elements with roughly equal probability.
var list = ["a", "b", "c"]
// Maps the string form of a sample to the number of times it was drawn.
var histogram = {}
for (i in 1..5000) {
var sample = random.sample(list, 3)
// Keyed by the list's string form, so the same elements in a different order
// are counted as a different outcome.
var string = sample.toString
if (!histogram.containsKey(string)) histogram[string] = 0
histogram[string] = histogram[string] + 1
}
// Three distinct elements have 3! = 6 orderings, and all should have appeared.
System.print(histogram.count) // expect: 6
for (key in histogram.keys) {
// Relative deviation of this outcome's count from the uniform mean, 5000 / 6.
var error = (histogram[key] / (5000 / 6) - 1).abs
// More than 10% off prints a "!!!" line. No expectation is declared for that
// output, so it presumably fails the test -- confirm against the test runner.
if (error > 0.1) System.print("!!! %(error)")
}
}

View File

@ -3,17 +3,25 @@ import "random" for Random
var random = Random.new(12345) var random = Random.new(12345)
// Should choose all elements with roughly equal probability. // Should choose all elements with roughly equal probability.
var list = ["a", "b", "c", "d"] var list = (0...10).toList
var histogram = {} var binom = [1, 10, 45, 120, 210, 252, 210, 120, 45, 10, 1]
for (i in 1..5000) {
var sample = random.sample(list, 3)
var string = sample.toString
if (!histogram.containsKey(string)) histogram[string] = 0
histogram[string] = histogram[string] + 1
}
System.print(histogram.count) // expect: 24 for (k in 0..10) {
for (key in histogram.keys) { var count = binom[k]
var error = (histogram[key] / (5000 / 24) - 1).abs
if (error > 0.2) System.print("!!! %(error)") var histogram = {}
for (i in 1..count * 100) {
var sample = random.sample(list, k)
// Create a bitmask to represent the unordered set.
var bitmask = 0
sample.each {|s| bitmask = bitmask | (1 << s) }
if (!histogram.containsKey(bitmask)) histogram[bitmask] = 0
histogram[bitmask] = histogram[bitmask] + 1
}
if (histogram.count != count) System.print("!!! %(count) %(histogram.count)")
for (key in histogram.keys) {
var error = (histogram[key] - 100).abs
if (error > 50) System.print("!!! %(error)")
}
} }