#!/usr/bin/env python
from __future__ import print_function
import argparse
import math
import os
import os.path
import re
import subprocess
import sys
# Runs the benchmarks.
#
# It runs several benchmarks across several languages. For each
# benchmark/language pair, it runs a number of trials. Each trial is one run of
# a single benchmark script. It spawns a process and runs the script. The
# script itself is expected to output some result which this script validates
# to ensure the benchmark is running correctly. Then the benchmark prints an
# elapsed time. The benchmark is expected to do the timing itself and only time
# the interesting code under test.
#
# This script then runs several trials and takes the best score. (It does
# multiple trials to account for random variance in running time coming from
# OS, CPU rate-limiting, etc.) It takes the best time on the assumption that
# the best run represents the language's ideal performance and that any other
# variance coming from the OS will only slow it down.
#
# After running a series of trials, the benchmark runner compares all of the
# languages' performance on a given benchmark. It compares by running time
# and by score, which is just the inverse of the running time.
#
# For Wren benchmarks, it can also compare against a "baseline". That's a
# recorded result of a previous run of the Wren benchmarks. This is useful --
# critical, actually -- for seeing how Wren performance changes. Generating a
# set of baselines before a change to the VM and then comparing those to the
# performance after a change is how we track improvements and regressions.
#
# To generate a baseline file, run this script with "--generate-baseline".
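#
# The baseline file is test/benchmark/baseline.txt: plain text with one
# "name,best_time" line per benchmark, which read_baseline() parses back in.
#
# Example invocations (run from the repo root, since BENCHMARK_DIR is a
# relative path; the flags are defined in main() below):
#
#   util/benchmark.py                      # run every benchmark
#   util/benchmark.py fib -l wren -l lua   # one benchmark, two languages
#   util/benchmark.py --generate-baseline  # record a new baseline
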
WREN_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
WREN_BIN = os.path.join(WREN_DIR, 'bin')
BENCHMARK_DIR = os.path.join('test', 'benchmark')
# How many times to run a given benchmark.
NUM_TRIALS = 10
BENCHMARKS = []
def BENCHMARK(name, pattern):
  regex = re.compile(pattern + "\n" + r"elapsed: (\d+\.\d+)", re.MULTILINE)
  BENCHMARKS.append([name, regex, None])
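
# Each BENCHMARKS entry is [name, expected-output regex, baseline score]. The
# regex must match the benchmark's printed output followed by a line like
# "elapsed: 1.234". The baseline score starts as None and is filled in later
# by read_baseline().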
BENCHMARK("api_call", "true")
BENCHMARK("api_foreign_method", "100000000")
BENCHMARK("binary_trees", """stretch tree of depth 13 check: -1
8192 trees of depth 4 check: -8192
2048 trees of depth 6 check: -2048
512 trees of depth 8 check: -512
128 trees of depth 10 check: -128
32 trees of depth 12 check: -32
long lived tree of depth 12 check: -1""")
BENCHMARK("binary_trees_gc", """stretch tree of depth 13 check: -1
8192 trees of depth 4 check: -8192
2048 trees of depth 6 check: -2048
512 trees of depth 8 check: -512
128 trees of depth 10 check: -128
32 trees of depth 12 check: -32
long lived tree of depth 12 check: -1""")
BENCHMARK("delta_blue", "14065400")
BENCHMARK("fib", r"""317811
317811
317811
317811
317811""")
BENCHMARK("fibers", r"""4999950000""")
BENCHMARK("for", r"""499999500000""")
BENCHMARK("method_call", r"""true
false""")
BENCHMARK("map_numeric", r"""500000500000""")
BENCHMARK("map_string", r"""12799920000""")
BENCHMARK("string_equals", r"""3000000""")
LANGUAGES = [
  ("wren", [os.path.join(WREN_BIN, 'wren')], ".wren"),
  ("dart", ["fletch", "run"], ".dart"),
  ("lua", ["lua"], ".lua"),
  ("luajit (-joff)", ["luajit", "-joff"], ".lua"),
  ("python", ["python"], ".py"),
  ("python3", ["python3"], ".py"),
  ("ruby", ["ruby"], ".rb")
]
results = {}
if sys.platform == 'win32':
  GREEN = NORMAL = RED = YELLOW = ''
else:
  GREEN = '\033[32m'
  NORMAL = '\033[0m'
  RED = '\033[31m'
  YELLOW = '\033[33m'

def green(text):
  return GREEN + text + NORMAL

def red(text):
  return RED + text + NORMAL

def yellow(text):
  return YELLOW + text + NORMAL
def get_score(time):
"""
Converts time into a "score". This is the inverse of the time with an
arbitrary scale applied to get the number in a nice range. The goal here is
to have benchmark results where faster = bigger number.
"""
return 1000.0 / time
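
# get_score() example: a best time of 2.0 seconds scores 1000.0 / 2.0 == 500.0,
# and halving the time doubles the score.
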
def standard_deviation(times):
"""
Calculates the standard deviation of a list of numbers.
"""
mean = sum(times) / len(times)
# Sum the squares of the differences from the mean.
result = 0
for time in times:
result += (time - mean) ** 2
return math.sqrt(result / len(times))
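
# Note that standard_deviation() is the population standard deviation (it
# divides by len(times) rather than len(times) - 1). For example,
# standard_deviation([1.0, 3.0]) returns 1.0.
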
def run_trial(benchmark, language):
"""Runs one benchmark one time for one language."""
executable_args = language[1]
# Hackish. If the benchmark name starts with "api_", it's testing the Wren
# C API, so run the test_api executable which has those test methods instead
# of the normal Wren build.
if benchmark[0].startswith("api_"):
executable_args = [
os.path.join(WREN_DIR, "build", "release", "test", "api_wren")
]
args = []
args.extend(executable_args)
args.append(os.path.join(BENCHMARK_DIR, benchmark[0] + language[2]))
try:
out = subprocess.check_output(args, universal_newlines=True)
except OSError:
print('Interpreter was not found')
return None
match = benchmark[1].match(out)
if match:
return float(match.group(1))
else:
print("Incorrect output:")
print(out)
return None
def run_benchmark_language(benchmark, language, benchmark_result):
"""
Runs one benchmark for a number of trials for one language.
Adds the result to benchmark_result, which is a map of language names to
results.
"""
name = "{0} - {1}".format(benchmark[0], language[0])
print("{0:30s}".format(name), end=' ')
if not os.path.exists(os.path.join(
BENCHMARK_DIR, benchmark[0] + language[2])):
print("No implementation for this language")
return
times = []
for i in range(0, NUM_TRIALS):
sys.stdout.flush()
time = run_trial(benchmark, language)
if not time:
return
times.append(time)
sys.stdout.write(".")
best = min(times)
score = get_score(best)
comparison = ""
if language[0] == "wren":
if benchmark[2] != None:
ratio = 100 * score / benchmark[2]
comparison = "{:6.2f}% relative to baseline".format(ratio)
if ratio > 105:
comparison = green(comparison)
if ratio < 95:
comparison = red(comparison)
else:
comparison = "no baseline"
else:
# Hack: assumes wren gets run first.
wren_score = benchmark_result["wren"]["score"]
ratio = 100.0 * wren_score / score
comparison = "{:6.2f}%".format(ratio)
if ratio > 105:
comparison = green(comparison)
if ratio < 95:
comparison = red(comparison)
print(" {:4.2f}s {:4.4f} {:s}".format(
best,
standard_deviation(times),
comparison))
benchmark_result[language[0]] = {
"desc": name,
"times": times,
"score": score
}
return score
def run_benchmark(benchmark, languages, graph):
"""Runs one benchmark for the given languages (or all of them)."""
benchmark_result = {}
results[benchmark[0]] = benchmark_result
num_languages = 0
for language in LANGUAGES:
if not languages or language[0] in languages:
num_languages += 1
run_benchmark_language(benchmark, language, benchmark_result)
if num_languages > 1 and graph:
graph_results(benchmark_result)
def graph_results(benchmark_result):
  print()
  INCREMENT = {
    '-': 'o',
    'o': 'O',
    'O': '0',
    '0': '0'
  }
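
  # Each trial plots one mark in a 68-column row. When several trials land in
  # the same column, INCREMENT "thickens" the mark: '-' -> 'o' -> 'O' -> '0'.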

  # Scale everything by the highest score.
  highest = 0
  for language, result in benchmark_result.items():
    score = get_score(min(result["times"]))
    if score > highest: highest = score

  print("{0:30s}0 {1:66.0f}".format("", highest))

  for language, result in benchmark_result.items():
    line = ["-"] * 68
    for time in result["times"]:
      index = int(get_score(time) / highest * 67)
      line[index] = INCREMENT[line[index]]
    print("{0:30s}{1}".format(result["desc"], "".join(line)))

  print()
def read_baseline():
  baseline_file = os.path.join(BENCHMARK_DIR, "baseline.txt")
  if os.path.exists(baseline_file):
    with open(baseline_file) as f:
      for line in f.readlines():
        name, best = line.split(",")
        for benchmark in BENCHMARKS:
          if benchmark[0] == name:
            benchmark[2] = float(best)
def generate_baseline():
print("generating baseline")
baseline_text = ""
for benchmark in BENCHMARKS:
best = run_benchmark_language(benchmark, LANGUAGES[0], {})
baseline_text += ("{},{}\n".format(benchmark[0], best))
# Write them to a file.
baseline_file = os.path.join(BENCHMARK_DIR, "baseline.txt")
with open(baseline_file, 'w') as out:
out.write(baseline_text)
def print_html():
  '''Print the results as an HTML chart.'''
  def print_benchmark(benchmark, name):
    print('<h3>{}</h3>'.format(name))
    print('<table class="chart">')

    # Scale everything by the highest time.
    highest = 0
    for language, result in results[benchmark].items():
      time = min(result["times"])
      if time > highest: highest = time

    languages = sorted(results[benchmark].keys(),
        key=lambda lang: results[benchmark][lang]["score"], reverse=True)
    for language in languages:
      result = results[benchmark][language]
      time = float(min(result["times"]))
      ratio = int(100 * time / highest)
      css_class = "chart-bar"
      if language == "wren":
        css_class += " wren"
      print(' <tr>')
      print(' <th>{}</th><td><div class="{}" style="width: {}%;">{:4.2f}s&nbsp;</div></td>'.format(
          language, css_class, ratio, time))
      print(' </tr>')
    print('</table>')

  print_benchmark("method_call", "Method Call")
  print_benchmark("delta_blue", "DeltaBlue")
  print_benchmark("binary_trees", "Binary Trees")
  print_benchmark("fib", "Recursive Fibonacci")
def main():
  parser = argparse.ArgumentParser(description="Run the benchmarks")
  parser.add_argument("benchmark", nargs='?',
      default="all",
      help="The benchmark to run")
  parser.add_argument("--generate-baseline",
      action="store_true",
      help="Generate a baseline file")
  parser.add_argument("--graph",
      action="store_true",
      help="Display graph results.")
  parser.add_argument("-l", "--language",
      action="append",
      help="Which language(s) to run benchmarks for")
  parser.add_argument("--output-html",
      action="store_true",
      help="Output the results chart as HTML")

  args = parser.parse_args()

  if args.generate_baseline:
    generate_baseline()
    return

  read_baseline()

  # Run the benchmarks.
  for benchmark in BENCHMARKS:
    if benchmark[0] == args.benchmark or args.benchmark == "all":
      run_benchmark(benchmark, args.language, args.graph)

  if args.output_html:
    print_html()


main()