#!/usr/bin/env python

from __future__ import print_function

import argparse
import math
import os
import os.path
import re
import subprocess
import sys

# Runs the benchmarks.
#
# It runs several benchmarks across several languages. For each
# benchmark/language pair, it runs a number of trials. Each trial is one run of
# a single benchmark script. It spawns a process and runs the script. The
# script itself is expected to output some result which this script validates
# to ensure the benchmark is running correctly. Then the benchmark prints an
# elapsed time. The benchmark is expected to do the timing itself and only time
# the interesting code under test.
#
# This script then runs several trials and takes the best score. (It does
# multiple trials to account for random variance in running time coming from
# the OS, CPU rate-limiting, etc.) It takes the best time on the assumption
# that that represents the language's ideal performance and that any variance
# coming from the OS will only slow it down.
#
# After running a series of trials, the benchmark runner compares all of the
# languages' performance for a given benchmark. It compares by running time
# and by score, which is just the inverse of the running time.
#
# For Wren benchmarks, it can also compare against a "baseline". That's a
# recorded result of a previous run of the Wren benchmarks. This is useful --
# critical, actually -- for seeing how Wren performance changes. Generating a
# set of baselines before a change to the VM and then comparing them to the
# performance after the change is how we track improvements and regressions.
#
# To generate a baseline file, run this script with "--generate-baseline".

WREN_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
WREN_BIN = os.path.join(WREN_DIR, 'bin')
BENCHMARK_DIR = os.path.join('test', 'benchmark')

# How many times to run a given benchmark.
NUM_TRIALS = 10

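# Each entry in BENCHMARKS is a triple: [name, expected-output regex, baseline
# score]. The baseline slot starts as None and is filled in by read_baseline()
# if a baseline file exists.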
BENCHMARKS = []

def BENCHMARK(name, pattern):
  regex = re.compile(pattern + "\n" + r"elapsed: (\d+\.\d+)", re.MULTILINE)
  BENCHMARKS.append([name, regex, None])

BENCHMARK("api_call", "true")

BENCHMARK("api_foreign_method", "100000000")

BENCHMARK("binary_trees", """stretch tree of depth 13 check: -1
8192 trees of depth 4 check: -8192
2048 trees of depth 6 check: -2048
512 trees of depth 8 check: -512
128 trees of depth 10 check: -128
32 trees of depth 12 check: -32
long lived tree of depth 12 check: -1""")

BENCHMARK("binary_trees_gc", """stretch tree of depth 13 check: -1
8192 trees of depth 4 check: -8192
2048 trees of depth 6 check: -2048
512 trees of depth 8 check: -512
128 trees of depth 10 check: -128
32 trees of depth 12 check: -32
long lived tree of depth 12 check: -1""")

BENCHMARK("delta_blue", "14065400")

BENCHMARK("fib", r"""317811
317811
317811
317811
317811""")

BENCHMARK("fibers", r"""4999950000""")

BENCHMARK("for", r"""499999500000""")

BENCHMARK("method_call", r"""true
false""")

BENCHMARK("map_numeric", r"""500000500000""")

BENCHMARK("map_string", r"""12799920000""")

BENCHMARK("string_equals", r"""3000000""")

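# Each language is a triple: (display name, command line used to invoke the
# interpreter, benchmark file extension).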
LANGUAGES = [
  ("wren", [os.path.join(WREN_BIN, 'wren')], ".wren"),
  ("dart", ["fletch", "run"], ".dart"),
  ("lua", ["lua"], ".lua"),
  ("luajit (-joff)", ["luajit", "-joff"], ".lua"),
  ("python", ["python"], ".py"),
  ("python3", ["python3"], ".py"),
  ("ruby", ["ruby"], ".rb")
]

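# Maps each benchmark name to the results for that benchmark:
# {benchmark name: {language name: result dict}}.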
results = {}

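# The default Windows console may not understand ANSI color escapes, so
# disable color output there.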
if sys.platform == 'win32':
  GREEN = NORMAL = RED = YELLOW = ''
else:
  GREEN = '\033[32m'
  NORMAL = '\033[0m'
  RED = '\033[31m'
  YELLOW = '\033[33m'

def green(text):
  return GREEN + text + NORMAL

def red(text):
  return RED + text + NORMAL

def yellow(text):
  return YELLOW + text + NORMAL


def get_score(time):
  """
  Converts time into a "score". This is the inverse of the time with an
  arbitrary scale applied to get the number in a nice range. The goal here is
  to have benchmark results where faster = bigger number.
  """
  return 1000.0 / time


def standard_deviation(times):
  """
  Calculates the standard deviation of a list of numbers.
  """
  mean = sum(times) / len(times)

  # Sum the squares of the differences from the mean.
  result = 0
  for time in times:
    result += (time - mean) ** 2

  return math.sqrt(result / len(times))


def run_trial(benchmark, language):
  """Runs one benchmark one time for one language."""
  executable_args = language[1]

  # Hackish. If the benchmark name starts with "api_", it's testing the Wren
  # C API, so run the test_api executable which has those test methods instead
  # of the normal Wren build.
  if benchmark[0].startswith("api_"):
    executable_args = [
      os.path.join(WREN_DIR, "build", "release", "test", "api_wren")
    ]

  args = []
  args.extend(executable_args)
  args.append(os.path.join(BENCHMARK_DIR, benchmark[0] + language[2]))

  try:
    out = subprocess.check_output(args, universal_newlines=True)
  except OSError:
    print('Interpreter was not found')
    return None
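  # The expected-output regex both validates the benchmark's result and
  # captures the elapsed time it reports.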
  match = benchmark[1].match(out)
  if match:
    return float(match.group(1))
  else:
    print("Incorrect output:")
    print(out)
    return None


def run_benchmark_language(benchmark, language, benchmark_result):
  """
  Runs one benchmark for a number of trials for one language.

  Adds the result to benchmark_result, which is a map of language names to
  results.
  """

  name = "{0} - {1}".format(benchmark[0], language[0])
  print("{0:30s}".format(name), end=' ')

  if not os.path.exists(os.path.join(
      BENCHMARK_DIR, benchmark[0] + language[2])):
    print("No implementation for this language")
    return

  times = []
  for i in range(0, NUM_TRIALS):
    sys.stdout.flush()
    time = run_trial(benchmark, language)
    # run_trial() returns None if the trial failed or produced bad output.
    if time is None:
      return
    times.append(time)
    sys.stdout.write(".")

  best = min(times)
  score = get_score(best)

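  # Green means Wren came out more than 5% ahead of the baseline (or the other
  # language), red means more than 5% behind.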
  comparison = ""
  if language[0] == "wren":
    if benchmark[2] is not None:
      ratio = 100 * score / benchmark[2]
      comparison = "{:6.2f}% relative to baseline".format(ratio)
      if ratio > 105:
        comparison = green(comparison)
      if ratio < 95:
        comparison = red(comparison)
    else:
      comparison = "no baseline"
  else:
    # Hack: assumes wren gets run first.
    wren_score = benchmark_result["wren"]["score"]
    ratio = 100.0 * wren_score / score
    comparison = "{:6.2f}%".format(ratio)
    if ratio > 105:
      comparison = green(comparison)
    if ratio < 95:
      comparison = red(comparison)

  print(" {:4.2f}s {:4.4f} {:s}".format(
      best,
      standard_deviation(times),
      comparison))

  benchmark_result[language[0]] = {
    "desc": name,
    "times": times,
    "score": score
  }

  return score


def run_benchmark(benchmark, languages, graph):
  """Runs one benchmark for the given languages (or all of them)."""

  benchmark_result = {}
  results[benchmark[0]] = benchmark_result

  num_languages = 0
  for language in LANGUAGES:
    if not languages or language[0] in languages:
      num_languages += 1
      run_benchmark_language(benchmark, language, benchmark_result)

  if num_languages > 1 and graph:
    graph_results(benchmark_result)


def graph_results(benchmark_result):
  print()

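  # Each trial plots one mark in its language's row at the column matching its
  # score. Repeated hits on the same column escalate the mark: '-', 'o', 'O',
  # then '0'.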
  INCREMENT = {
    '-': 'o',
    'o': 'O',
    'O': '0',
    '0': '0'
  }

  # Scale everything by the highest score.
  highest = 0
  for language, result in benchmark_result.items():
    score = get_score(min(result["times"]))
    if score > highest: highest = score

  print("{0:30s}0 {1:66.0f}".format("", highest))
  for language, result in benchmark_result.items():
    line = ["-"] * 68
    for time in result["times"]:
      index = int(get_score(time) / highest * 67)
      line[index] = INCREMENT[line[index]]
    print("{0:30s}{1}".format(result["desc"], "".join(line)))
  print()


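# The baseline file stores one line per benchmark: "<name>,<score>".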
def read_baseline():
  baseline_file = os.path.join(BENCHMARK_DIR, "baseline.txt")
  if os.path.exists(baseline_file):
    with open(baseline_file) as f:
      for line in f.readlines():
        name, best = line.split(",")
        for benchmark in BENCHMARKS:
          if benchmark[0] == name:
            benchmark[2] = float(best)


def generate_baseline():
  print("generating baseline")
  baseline_text = ""
  for benchmark in BENCHMARKS:
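    # run_benchmark_language() returns the score (inverse time) of the best
    # trial, and that score is what gets recorded as the baseline.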
    best = run_benchmark_language(benchmark, LANGUAGES[0], {})
    baseline_text += ("{},{}\n".format(benchmark[0], best))

  # Write them to a file.
  baseline_file = os.path.join(BENCHMARK_DIR, "baseline.txt")
  with open(baseline_file, 'w') as out:
    out.write(baseline_text)


def print_html():
  '''Print the results as an HTML chart.'''

  def print_benchmark(benchmark, name):
    print('<h3>{}</h3>'.format(name))
    print('<table class="chart">')

    # Scale everything by the highest time.
    highest = 0
    for language, result in results[benchmark].items():
      time = min(result["times"])
      if time > highest: highest = time

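    # Sort languages from fastest to slowest by score.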
    languages = sorted(results[benchmark].keys(),
        key=lambda lang: results[benchmark][lang]["score"], reverse=True)

    for language in languages:
      result = results[benchmark][language]
      time = float(min(result["times"]))
      ratio = int(100 * time / highest)
      css_class = "chart-bar"
      if language == "wren":
        css_class += " wren"
      print(' <tr>')
      print(' <th>{}</th><td><div class="{}" style="width: {}%;">{:4.2f}s </div></td>'.format(
          language, css_class, ratio, time))
      print(' </tr>')
    print('</table>')

  print_benchmark("method_call", "Method Call")
  print_benchmark("delta_blue", "DeltaBlue")
  print_benchmark("binary_trees", "Binary Trees")
  print_benchmark("fib", "Recursive Fibonacci")


def main():
  parser = argparse.ArgumentParser(description="Run the benchmarks")
  parser.add_argument("benchmark", nargs='?',
      default="all",
      help="The benchmark to run")
  parser.add_argument("--generate-baseline",
      action="store_true",
      help="Generate a baseline file")
  parser.add_argument("--graph",
      action="store_true",
      help="Display graph results.")
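  # --language may be passed multiple times to benchmark several languages.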
  parser.add_argument("-l", "--language",
      action="append",
      help="Which language(s) to run benchmarks for")
  parser.add_argument("--output-html",
      action="store_true",
      help="Output the results chart as HTML")

  args = parser.parse_args()

  if args.generate_baseline:
    generate_baseline()
    return

  read_baseline()

  # Run the benchmarks.
  for benchmark in BENCHMARKS:
    if benchmark[0] == args.benchmark or args.benchmark == "all":
      run_benchmark(benchmark, args.language, args.graph)

  if args.output_html:
    print_html()


if __name__ == "__main__":
  main()