#!/usr/bin/env python
from __future__ import print_function

import argparse
import math
import os
import os.path
import re
import subprocess
import sys

# Runs the benchmarks and compares the results across languages.

WREN_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
BENCHMARK_DIR = os.path.join(WREN_DIR, 'benchmark')

# How many times to run a given benchmark.
NUM_TRIALS = 10

BENCHMARKS = []

def BENCHMARK(name, pattern):
  regex = re.compile(pattern + "\n" + r"elapsed: (\d+\.\d+)", re.MULTILINE)
  BENCHMARKS.append([name, regex, None])

BENCHMARK("binary_trees", """stretch tree of depth 13 check: -1
8192 trees of depth 4 check: -8192
2048 trees of depth 6 check: -2048
512 trees of depth 8 check: -512
128 trees of depth 10 check: -128
32 trees of depth 12 check: -32
long lived tree of depth 12 check: -1""")

BENCHMARK("delta_blue", "7032700")

BENCHMARK("fib", r"""317811
317811
317811
317811
317811""")

BENCHMARK("for", r"""499999500000""")

BENCHMARK("method_call", r"""true
false""")

BENCHMARK("map_numeric", r"""500000500000""")

BENCHMARK("map_string", r"""3645600""")

LANGUAGES = [
  ("wren",           [os.path.join(WREN_DIR, 'wren')], ".wren"),
  ("lua",            ["lua"],                          ".lua"),
  ("luajit (-joff)", ["luajit", "-joff"],              ".lua"),
  ("python",         ["python"],                       ".py"),
  ("python3",        ["python3"],                      ".py"),
  ("ruby",           ["ruby"],                         ".rb")
]

results = {}

def green(text):
  if sys.platform == 'win32':
    return text
  return '\033[32m' + text + '\033[0m'

def red(text):
  if sys.platform == 'win32':
    return text
  return '\033[31m' + text + '\033[0m'

def yellow(text):
  if sys.platform == 'win32':
    return text
  return '\033[33m' + text + '\033[0m'

def get_score(time):
  """
  Converts time into a "score". This is the inverse of the time with an
  arbitrary scale applied to get the number in a nice range. The goal here is
  to have benchmark results where faster = bigger number.
  """
  return 1000.0 / time

def run_trial(benchmark, language):
  """Runs one benchmark one time for one language."""
  args = []
  args.extend(language[1])
  args.append(os.path.join(BENCHMARK_DIR, benchmark[0] + language[2]))
  try:
    out = subprocess.check_output(args, universal_newlines=True)
  except OSError:
    print('Interpreter was not found')
    return None

  match = benchmark[1].match(out)
  if match:
    return float(match.group(1))
  else:
    print("Incorrect output:")
    print(out)
    return None

def run_benchmark_language(benchmark, language, benchmark_result):
  """
  Runs one benchmark for a number of trials for one language.

  Adds the result to benchmark_result, which is a map of language names to
  results.
  """
  name = "{0} - {1}".format(benchmark[0], language[0])
  print("{0:30s}".format(name), end=' ')

  if not os.path.exists(os.path.join(
      BENCHMARK_DIR, benchmark[0] + language[2])):
    print("No implementation for this language")
    return

  times = []
  for i in range(0, NUM_TRIALS):
    time = run_trial(benchmark, language)
    if time is None:
      return
    times.append(time)
    sys.stdout.write(".")

  best = min(times)
  score = get_score(best)

  comparison = ""
  if language[0] == "wren":
    if benchmark[2] is not None:
      ratio = 100 * score / benchmark[2]
      comparison = "{:6.2f}% relative to baseline".format(ratio)
      if ratio > 105:
        comparison = green(comparison)
      if ratio < 95:
        comparison = red(comparison)
    else:
      comparison = "no baseline"
  else:
    # Hack: assumes wren gets run first.
    wren_score = benchmark_result["wren"]["score"]
    ratio = 100.0 * wren_score / score
    comparison = "{:6.2f}%".format(ratio)
    if ratio > 105:
      comparison = green(comparison)
    if ratio < 95:
      comparison = red(comparison)

  print(" {:5.0f} {:4.2f}s {:s}".format(score, best, comparison))

  benchmark_result[language[0]] = {
    "desc": name,
    "times": times,
    "score": score
  }

  return score
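
# A note on scoring, with a worked example: get_score() is simply
# 1000.0 / time, so a best time of 0.5s scores 2000.0 and halving the runtime
# doubles the score. For a non-wren language, the percentage printed in the
# comparison column is wren's score as a percentage of that language's score,
# so values above 100% mean wren ran faster on that benchmark.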

def run_benchmark(benchmark, languages):
  """Runs one benchmark for the given languages (or all of them)."""
  benchmark_result = {}
  results[benchmark[0]] = benchmark_result

  num_languages = 0
  for language in LANGUAGES:
    if not languages or language[0] in languages:
      num_languages += 1
      run_benchmark_language(benchmark, language, benchmark_result)

  if num_languages > 1:
    graph_results(benchmark_result)

def graph_results(benchmark_result):
  print()

  INCREMENT = {
    '-': 'o',
    'o': 'O',
    'O': '0',
    '0': '0'
  }

  # Scale everything by the highest score.
  highest = 0
  for language, result in benchmark_result.items():
    score = get_score(min(result["times"]))
    if score > highest:
      highest = score

  print("{0:30s}0 {1:66.0f}".format("", highest))
  for language, result in benchmark_result.items():
    line = ["-"] * 68
    for time in result["times"]:
      index = int(get_score(time) / highest * 67)
      line[index] = INCREMENT[line[index]]
    print("{0:30s}{1}".format(result["desc"], "".join(line)))
  print()

def read_baseline():
  if os.path.exists("baseline.txt"):
    with open("baseline.txt") as f:
      for line in f.readlines():
        name, best = line.split(",")
        for benchmark in BENCHMARKS:
          if benchmark[0] == name:
            benchmark[2] = float(best)

def generate_baseline():
  print("generating baseline")
  baseline_text = ""
  for benchmark in BENCHMARKS:
    best = run_benchmark_language(benchmark, LANGUAGES[0], {})
    baseline_text += ("{},{}\n".format(benchmark[0], best))

  # Write them to a file.
  with open("baseline.txt", 'w') as out:
    out.write(baseline_text)
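
# Baseline file format: read_baseline() and generate_baseline() both use
# "baseline.txt" in the current working directory. Each line has the form
# "<benchmark name>,<score>", where the score is the value returned by
# run_benchmark_language() for the wren interpreter. For example, a line
# might look like "fib,412.37" (the number here is purely illustrative).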


def print_html():
  '''Print the results as an HTML chart.'''

  def print_benchmark(benchmark, name):
    print('<h3>{}</h3>'.format(name))
    print('<table class="chart">')

    # Scale everything by the highest time.
    highest = 0
    for language, result in results[benchmark].items():
      time = min(result["times"])
      if time > highest:
        highest = time

    languages = sorted(results[benchmark].keys(),
        key=lambda lang: results[benchmark][lang]["score"], reverse=True)

    for language in languages:
      result = results[benchmark][language]
      time = float(min(result["times"]))
      ratio = int(100 * time / highest)
      css_class = "chart-bar"
      if language == "wren":
        css_class += " wren"
      print('  <tr>')
      print('    <th>{}</th><td><div class="{}" style="width: {}%;">{:4.2f}s&nbsp;</div></td>'.format(
          language, css_class, ratio, time))
      print('  </tr>')

    print('</table>')

  print_benchmark("method_call", "Method Call")
  print_benchmark("delta_blue", "DeltaBlue")
  print_benchmark("binary_trees", "Binary Trees")
  print_benchmark("fib", "Recursive Fibonacci")

def main():
  parser = argparse.ArgumentParser(description="Run the benchmarks")
  parser.add_argument("benchmark", nargs='?',
      default="all",
      help="The benchmark to run")
  parser.add_argument("--generate-baseline",
      action="store_true",
      help="Generate a baseline file")
  parser.add_argument("-l", "--language",
      action="append",
      help="Which language(s) to run benchmarks for")
  parser.add_argument("--output-html",
      action="store_true",
      help="Output the results chart as HTML")

  args = parser.parse_args()

  if args.generate_baseline:
    generate_baseline()
    return

  read_baseline()

  # Run the benchmarks.
  for benchmark in BENCHMARKS:
    if benchmark[0] == args.benchmark or args.benchmark == "all":
      run_benchmark(benchmark, args.language)

  if args.output_html:
    print_html()

if __name__ == "__main__":
  main()
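
# Example invocations (the script name "benchmark.py" is illustrative; use
# whatever this file is saved as):
#
#   python benchmark.py                      # run all benchmarks, all languages
#   python benchmark.py fib -l wren -l lua   # one benchmark, only wren and lua
#   python benchmark.py --generate-baseline  # record wren scores to baseline.txt
#   python benchmark.py --output-html        # also print the HTML results chart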