#!/usr/bin/env python
from __future__ import print_function
import argparse
import math
import os
import os.path
import re
import subprocess
import sys

# Runs the benchmarks.
#
# It runs several benchmarks across several languages. For each
# benchmark/language pair, it runs a number of trials. Each trial is one run of
# a single benchmark script. It spawns a process and runs the script. The
# script itself is expected to output some result which this script validates
# to ensure the benchmark is running correctly. Then the benchmark prints an
# elapsed time. The benchmark is expected to do the timing itself and only time
# the interesting code under test.
#
# This script then runs several trials and takes the best score. (It does
# multiple trials to account for random variance in running time coming from
# OS, CPU rate-limiting, etc.) It takes the best time on the assumption that
# that represents the language's ideal performance and any variance coming from
# the OS will just slow it down.
#
# After running a series of trials, the benchmark runner compares all of the
# languages' performance for a given benchmark. It compares by both running
# time and score, which is just the inverse of the running time.
#
# For Wren benchmarks, it can also compare against a "baseline". That's a
# recorded result of a previous run of the Wren benchmarks. This is useful --
# critical, actually -- for seeing how Wren performance changes. Generating a
# set of baselines before a change to the VM and then comparing those to the
# performance after a change is how we track improvements and regressions.
#
# To generate a baseline file, run this script with "--generate-baseline".
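#
# A few illustrative invocations (run from the repository root; the flags are
# the ones defined in main() below):
#
#   ./script/benchmark.py                       # every benchmark, every language
#   ./script/benchmark.py fib -l wren -l lua    # just "fib", for Wren and Lua only
#   ./script/benchmark.py --generate-baseline   # record Wren scores as the baseline
#   ./script/benchmark.py --output-html         # also emit the results as an HTML chart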

WREN_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
BENCHMARK_DIR = os.path.join(WREN_DIR, 'test', 'benchmark')

# How many times to run a given benchmark.
NUM_TRIALS = 10

BENCHMARKS = []

def BENCHMARK(name, pattern):
  regex = re.compile(pattern + "\n" + r"elapsed: (\d+\.\d+)", re.MULTILINE)
  BENCHMARKS.append([name, regex, None])
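
# Each entry in BENCHMARKS is [name, expected-output regex, baseline score or
# None]. A benchmark script is expected to print its result followed by an
# "elapsed: <seconds>" line that it times itself. For instance, the "for"
# benchmark registered below should print something like (the elapsed value is
# illustrative):
#
#     499999500000
#     elapsed: 0.8731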
BENCHMARK("binary_trees", """stretch tree of depth 13 check: -1
8192 trees of depth 4 check: -8192
2048 trees of depth 6 check: -2048
512 trees of depth 8 check: -512
128 trees of depth 10 check: -128
32 trees of depth 12 check: -32
long lived tree of depth 12 check: -1""")
BENCHMARK("delta_blue", "7032700")
BENCHMARK("fib", r"""317811
317811
317811
317811
317811""")
BENCHMARK("for", r"""499999500000""")
BENCHMARK("method_call", r"""true
false""")
BENCHMARK("map_numeric", r"""500000500000""")
BENCHMARK("map_string", r"""3645600""")

LANGUAGES = [
  ("wren", [os.path.join(WREN_DIR, 'wren')], ".wren"),
  ("lua", ["lua"], ".lua"),
  ("luajit (-joff)", ["luajit", "-joff"], ".lua"),
  ("python", ["python"], ".py"),
  ("python3", ["python3"], ".py"),
  ("ruby", ["ruby"], ".rb")
]
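
# Each entry is (display name, interpreter command, file extension). run_trial()
# appends the benchmark's source file to the command, so e.g. "fib" under
# "luajit (-joff)" ends up running: luajit -joff <BENCHMARK_DIR>/fib.lua
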
results = {}

def green(text):
  if sys.platform == 'win32':
    return text
  return '\033[32m' + text + '\033[0m'


def red(text):
  if sys.platform == 'win32':
    return text
  return '\033[31m' + text + '\033[0m'


def yellow(text):
  if sys.platform == 'win32':
    return text
  return '\033[33m' + text + '\033[0m'

def get_score(time):
  """
  Converts time into a "score". This is the inverse of the time with an
  arbitrary scale applied to get the number in a nice range. The goal here is
  to have benchmark results where faster = bigger number.
  """
  return 1000.0 / time
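
# For example, a 2.0 second run scores 1000.0 / 2.0 = 500, while a 0.5 second
# run scores 2000: halving the time doubles the score.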

def run_trial(benchmark, language):
  """Runs one benchmark one time for one language."""
  args = []
  args.extend(language[1])
  args.append(os.path.join(BENCHMARK_DIR, benchmark[0] + language[2]))

  try:
    out = subprocess.check_output(args, universal_newlines=True)
  except OSError:
    print('Interpreter was not found')
    return None

  match = benchmark[1].match(out)
  if match:
    return float(match.group(1))
  else:
    print("Incorrect output:")
    print(out)
    return None

def run_benchmark_language(benchmark, language, benchmark_result):
  """
  Runs one benchmark for a number of trials for one language.

  Adds the result to benchmark_result, which is a map of language names to
  results.
  """
  name = "{0} - {1}".format(benchmark[0], language[0])
  print("{0:30s}".format(name), end=' ')

  if not os.path.exists(os.path.join(
      BENCHMARK_DIR, benchmark[0] + language[2])):
    print("No implementation for this language")
    return

  times = []
  for i in range(0, NUM_TRIALS):
    time = run_trial(benchmark, language)
    if not time:
      return
    times.append(time)
    sys.stdout.write(".")

  best = min(times)
  score = get_score(best)

  comparison = ""
  if language[0] == "wren":
    if benchmark[2] != None:
      ratio = 100 * score / benchmark[2]
      comparison = "{:6.2f}% relative to baseline".format(ratio)
      if ratio > 105:
        comparison = green(comparison)
      if ratio < 95:
        comparison = red(comparison)
    else:
      comparison = "no baseline"
  else:
    # Hack: assumes wren gets run first.
    wren_score = benchmark_result["wren"]["score"]
    ratio = 100.0 * wren_score / score
    comparison = "{:6.2f}%".format(ratio)
    if ratio > 105:
      comparison = green(comparison)
    if ratio < 95:
      comparison = red(comparison)

  print(" {:5.0f} {:4.2f}s {:s}".format(score, best, comparison))

  benchmark_result[language[0]] = {
    "desc": name,
    "times": times,
    "score": score
  }

  return score
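
# A completed row on the console looks roughly like this (values illustrative):
#
#   fib - wren                     ..........   412 2.43s 103.21% relative to baseline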

def run_benchmark(benchmark, languages):
  """Runs one benchmark for the given languages (or all of them)."""
  benchmark_result = {}
  results[benchmark[0]] = benchmark_result

  num_languages = 0
  for language in LANGUAGES:
    if not languages or language[0] in languages:
      num_languages += 1
      run_benchmark_language(benchmark, language, benchmark_result)

  if num_languages > 1:
    graph_results(benchmark_result)
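
# graph_results() draws one 68-column row per language, scaled so the best
# score seen sits at the right edge. Each trial leaves a mark in the column
# matching its score, and repeated hits in the same column escalate
# 'o' -> 'O' -> '0'.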
def graph_results(benchmark_result):
  print()

  INCREMENT = {
    '-': 'o',
    'o': 'O',
    'O': '0',
    '0': '0'
  }

  # Scale everything by the highest score.
  highest = 0
  for language, result in benchmark_result.items():
    score = get_score(min(result["times"]))
    if score > highest: highest = score

  print("{0:30s}0 {1:66.0f}".format("", highest))
  for language, result in benchmark_result.items():
    line = ["-"] * 68
    for time in result["times"]:
      index = int(get_score(time) / highest * 67)
      line[index] = INCREMENT[line[index]]
    print("{0:30s}{1}".format(result["desc"], "".join(line)))
  print()
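
# The baseline file is plain text with one "benchmark_name,score" pair per
# line, written by generate_baseline() and parsed back by read_baseline().
# For example (scores illustrative):
#
#     fib,411.52
#     binary_trees,98.73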
def read_baseline():
  baseline_file = os.path.join(BENCHMARK_DIR, "baseline.txt")
  if os.path.exists(baseline_file):
    with open(baseline_file) as f:
      for line in f.readlines():
        name, best = line.split(",")
        for benchmark in BENCHMARKS:
          if benchmark[0] == name:
            benchmark[2] = float(best)

def generate_baseline():
  print("generating baseline")
  baseline_text = ""
  for benchmark in BENCHMARKS:
    best = run_benchmark_language(benchmark, LANGUAGES[0], {})
    baseline_text += ("{},{}\n".format(benchmark[0], best))

  # Write them to a file.
  baseline_file = os.path.join(BENCHMARK_DIR, "baseline.txt")
  with open(baseline_file, 'w') as out:
    out.write(baseline_text)
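
# print_html() renders each benchmark as a <table class="chart"> of horizontal
# bars, one row per language, with the slowest time at 100% width. A row comes
# out roughly like (values illustrative):
#
#   <tr>
#     <th>wren</th><td><div class="chart-bar wren" style="width: 43%;">2.43s&nbsp;</div></td>
#   </tr>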
def print_html():
  '''Print the results as an HTML chart.'''

  def print_benchmark(benchmark, name):
    print('<h3>{}</h3>'.format(name))
    print('<table class="chart">')

    # Scale everything by the highest time.
    highest = 0
    for language, result in results[benchmark].items():
      time = min(result["times"])
      if time > highest: highest = time

    languages = sorted(results[benchmark].keys(),
        key=lambda lang: results[benchmark][lang]["score"], reverse=True)
    for language in languages:
      result = results[benchmark][language]
      time = float(min(result["times"]))
      ratio = int(100 * time / highest)
      css_class = "chart-bar"
      if language == "wren":
        css_class += " wren"
      print(' <tr>')
      print(' <th>{}</th><td><div class="{}" style="width: {}%;">{:4.2f}s&nbsp;</div></td>'.format(
          language, css_class, ratio, time))
      print(' </tr>')

    print('</table>')

  print_benchmark("method_call", "Method Call")
  print_benchmark("delta_blue", "DeltaBlue")
  print_benchmark("binary_trees", "Binary Trees")
  print_benchmark("fib", "Recursive Fibonacci")

def main():
  parser = argparse.ArgumentParser(description="Run the benchmarks")
  parser.add_argument("benchmark", nargs='?',
      default="all",
      help="The benchmark to run")
  parser.add_argument("--generate-baseline",
      action="store_true",
      help="Generate a baseline file")
  parser.add_argument("-l", "--language",
      action="append",
      help="Which language(s) to run benchmarks for")
  parser.add_argument("--output-html",
      action="store_true",
      help="Output the results chart as HTML")

  args = parser.parse_args()

  if args.generate_baseline:
    generate_baseline()
    return

  read_baseline()

  # Run the benchmarks.
  for benchmark in BENCHMARKS:
    if benchmark[0] == args.benchmark or args.benchmark == "all":
      run_benchmark(benchmark, args.language)

  if args.output_html:
    print_html()


main()