From b937597ea72e633d7c9619cb1378eecb20416efb Mon Sep 17 00:00:00 2001
From: Bob Nystrom
Date: Fri, 20 Dec 2013 07:04:04 -0800
Subject: [PATCH] Make benchmarks score based instead of time based.

---
 benchmark/run_bench | 96 +++++++++++++++------------------------------
 1 file changed, 31 insertions(+), 65 deletions(-)

diff --git a/benchmark/run_bench b/benchmark/run_bench
index 9b3c44ca..eb7c3736 100755
--- a/benchmark/run_bench
+++ b/benchmark/run_bench
@@ -7,9 +7,8 @@ import re
 import subprocess
 import sys
 
-# How many times to run a given benchmark. Should be an odd number to get the
-# right median.
-NUM_TRIALS = 7
+# How many times to run a given benchmark.
+NUM_TRIALS = 10
 
 BENCHMARKS = []
 
@@ -59,14 +58,13 @@ def yellow(text):
   return '\033[33m' + text + '\033[0m'
 
 
-def calc_stats(nums):
-  """Calculates the best, mean, and median of a list of numbers."""
-  mean = sum(nums) / len(nums)
-  nums.sort()
-  median = nums[(len(nums) - 1) / 2]
-  diffs = ((n - mean) * (n - mean) for n in nums)
-  std_dev = math.sqrt(sum(diffs) / len(nums))
-  return [nums[0], mean, median, std_dev]
+def get_score(time):
+  """
+  Converts time into a "score". This is the inverse of the time with an
+  arbitrary scale applied to get the number in a nice range. The goal here is
+  to have benchmark results where faster = bigger number.
+  """
+  return 1000.0 / time
 
 
 def run_trial(benchmark, language):
@@ -99,35 +97,34 @@ def run_benchmark_language(benchmark, language):
     times.append(time)
     sys.stdout.write(".")
 
-  times.sort()
-  stats = calc_stats(times)
+  best = min(times)
+  score = get_score(best)
 
   comparison = ""
   if language[0] == "wren":
     if benchmark[2] != None:
-      ratio = 100 * stats[0] / benchmark[2]
-      comparison = "{0:.2f}% of baseline".format(ratio)
+      ratio = 100 * score / benchmark[2]
+      comparison = "{:6.2f}% relative to baseline".format(ratio)
       if ratio > 105:
-        comparison = red(comparison)
-      if ratio < 95:
         comparison = green(comparison)
+      if ratio < 95:
+        comparison = red(comparison)
     else:
       comparison = "no baseline"
   else:
     # Hack: assumes wren is first language.
-    wren_time = results[0][2]
-    ratio = stats[1] / wren_time
-    comparison = "{0:.2f}x wren".format(ratio)
-    if ratio < 1:
-      comparison = red(comparison)
-    if ratio > 1:
+    wren_score = results[0][2]
+    ratio = 100.0 * wren_score / score
+    comparison = "{:6.2f}%".format(ratio)
+    if ratio > 105:
       comparison = green(comparison)
+    if ratio < 95:
+      comparison = red(comparison)
 
-  print " best: {0:.2f} mean: {1:.2f} median: {2:.2f} {3:s}".format(
-      stats[0], stats[1], stats[2], comparison)
+  print " {:4.0f} {:4.2f}s {:s}".format(score, best, comparison)
 
-  results.append([name, times, stats[0]])
-  return stats
+  results.append([name, times, score])
+  return score
 
 
 def run_benchmark(benchmark, languages):
@@ -139,37 +136,6 @@
   del results[0:len(results)]
 
 
-# TODO(bob): Hook this up so it can be called.
-def solo_benchmark(benchmark, language):
-  """Runs a single language benchmark repeatedly, graphing the results."""
-  base = benchmark[2]
-  total = 0
-  for i in range(0, NUM_TRIALS):
-    time = run_trial(benchmark, language)
-    total += time
-    ratio = 100 * time / base
-
-    # TODO(bob): Show scale.
-
-    line = [" "] * 51
-    line[25] = "|"
-    index = 25 + int((time - base) * 200)
-    if index < 0: index = 0
-    if index > 50: index = 50
-    line[index] = "*"
-
-    comparison = "{0:.4f} ({1:6.2f}%) {2}".format(time, ratio, "".join(line))
-    if ratio > 105:
-      comparison = red(comparison)
-    if ratio < 95:
-      comparison = green(comparison)
-    print comparison
-
-  total /= NUM_TRIALS
-  print "----"
-  print "{0:.4f} ({1:6.2f}%)".format(total, 100 * total / base)
-
-
 def graph_results():
   print
 
@@ -180,17 +146,17 @@
     '0': '0'
   }
 
-  # Scale everything by the highest time.
+  # Scale everything by the highest score.
   highest = 0
   for result in results:
-    time = max(result[1])
-    if time > highest: highest = time
+    score = get_score(min(result[1]))
+    if score > highest: highest = score
 
-  print "{0:22s}0.0 {1:64.4f}".format("", highest)
+  print "{0:22s}0 {1:66.0f}".format("", highest)
   for result in results:
     line = ["-"] * 68
     for time in result[1]:
-      index = int(time / highest * 67)
+      index = int(get_score(time) / highest * 67)
       line[index] = INCREMENT[line[index]]
     print "{0:22s}{1}".format(result[0], "".join(line))
   print
@@ -210,8 +176,8 @@ def generate_baseline():
  print "generating baseline"
  baseline_text = ""
  for benchmark in BENCHMARKS:
-    stats = run_benchmark_language(benchmark, LANGUAGES[0])
-    baseline_text += ("{},{}\n".format(benchmark[0], stats[0]))
+    best = run_benchmark_language(benchmark, LANGUAGES[0])
+    baseline_text += ("{},{}\n".format(benchmark[0], best))
 
   # Write them to a file.
   with open("baseline.txt", 'w') as out:
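As a quick illustration of the new scoring scheme, the sketch below is separate from the patch and uses made-up timings and a made-up baseline value; it only shows how a raw time turns into a score and how the percentage comparison against the stored baseline is derived.

def get_score(time):
  # Inverse of the elapsed time, scaled so faster runs give bigger scores.
  return 1000.0 / time

# Made-up numbers for one benchmark.
best = 0.31        # best wall-clock time, in seconds, out of NUM_TRIALS runs
baseline = 3100.0  # score previously recorded in baseline.txt for this benchmark

score = get_score(best)         # 1000.0 / 0.31, roughly 3226
ratio = 100 * score / baseline  # roughly 104.1% of the recorded baseline

# run_bench prints the ratio and colors it green above 105 (faster than the
# baseline) and red below 95 (slower).
print(" {:4.0f} {:4.2f}s {:6.2f}% relative to baseline".format(score, best, ratio))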