
Benchmark chart.

Bob Nystrom
2014-04-20 21:04:41 -07:00
parent d338ef2f07
commit 767e47bbf6
3 changed files with 197 additions and 34 deletions

View File

@@ -51,7 +51,7 @@ LANGUAGES = [
   ("ruby", ["ruby"], ".rb")
 ]
 
-results = []
+results = {}
 
 def green(text):
   if sys.platform == 'win32':
@@ -93,8 +93,14 @@ def run_trial(benchmark, language):
   return None
 
-def run_benchmark_language(benchmark, language):
-  """Runs one benchmark for a number of trials for one language."""
+def run_benchmark_language(benchmark, language, benchmark_result):
+  """
+  Runs one benchmark for a number of trials for one language.
+
+  Adds the result to benchmark_result, which is a map of language names to
+  results.
+  """
   name = "{0} - {1}".format(benchmark[0], language[0])
   print "{0:30s}".format(name),
@@ -126,8 +132,8 @@ def run_benchmark_language(benchmark, language):
     else:
       comparison = "no baseline"
   else:
-    # Hack: assumes wren is first language.
-    wren_score = results[0][2]
+    # Hack: assumes wren gets run first.
+    wren_score = benchmark_result["wren"]["score"]
     ratio = 100.0 * wren_score / score
     comparison = "{:6.2f}%".format(ratio)
     if ratio > 105:
@@ -137,25 +143,32 @@ def run_benchmark_language(benchmark, language):
   print " {:5.0f} {:4.2f}s {:s}".format(score, best, comparison)
 
-  results.append([name, times, score])
+  benchmark_result[language[0]] = {
+    "desc": name,
+    "times": times,
+    "score": score
+  }
   return score
 
 def run_benchmark(benchmark, languages):
   """Runs one benchmark for the given languages (or all of them)."""
+  benchmark_result = {}
+  results[benchmark[0]] = benchmark_result
+
   num_languages = 0
   for language in LANGUAGES:
     if not languages or language[0] in languages:
       num_languages += 1
-      run_benchmark_language(benchmark, language)
+      run_benchmark_language(benchmark, language, benchmark_result)
 
   if num_languages > 1:
-    graph_results()
-    del results[0:len(results)]
+    graph_results(benchmark_result)
 
-def graph_results():
+def graph_results(benchmark_result):
   print
 
   INCREMENT = {
@@ -167,17 +180,17 @@ def graph_results():
   # Scale everything by the highest score.
   highest = 0
-  for result in results:
-    score = get_score(min(result[1]))
+  for language, result in benchmark_result.items():
+    score = get_score(min(result["times"]))
     if score > highest: highest = score
 
   print "{0:30s}0 {1:66.0f}".format("", highest)
-  for result in results:
+  for language, result in benchmark_result.items():
     line = ["-"] * 68
-    for time in result[1]:
+    for time in result["times"]:
       index = int(get_score(time) / highest * 67)
       line[index] = INCREMENT[line[index]]
-    print "{0:30s}{1}".format(result[0], "".join(line))
+    print "{0:30s}{1}".format(result["desc"], "".join(line))
   print
@@ -195,7 +208,7 @@ def generate_baseline():
   print "generating baseline"
   baseline_text = ""
   for benchmark in BENCHMARKS:
-    best = run_benchmark_language(benchmark, LANGUAGES[0])
+    best = run_benchmark_language(benchmark, LANGUAGES[0], {})
     baseline_text += ("{},{}\n".format(benchmark[0], best))
 
   # Write them to a file.
@@ -203,6 +216,41 @@ def generate_baseline():
     out.write(baseline_text)
 
+def print_html():
+  '''Print the results as an HTML chart.'''
+  def print_benchmark(benchmark, name):
+    print '<h3>{}</h3>'.format(name)
+    print '<table class="chart">'
+
+    # Scale everything by the highest score.
+    highest = 0
+    for language, result in results[benchmark].items():
+      score = get_score(min(result["times"]))
+      if score > highest: highest = score
+
+    languages = sorted(results[benchmark].keys(),
+        key=lambda lang: results[benchmark][lang]["score"], reverse=True)
+
+    for language in languages:
+      result = results[benchmark][language]
+      score = int(result["score"])
+      ratio = int(100 * score / highest)
+      css_class = "chart-bar"
+      if language == "wren":
+        css_class += " wren"
+      print '  <tr>'
+      print '    <th>{}</th><td><div class="{}" style="width: {}%;">{}&nbsp;</div></td>'.format(
+          language, css_class, ratio, score)
+      print '  </tr>'
+    print '</table>'
+
+  print_benchmark("method_call", "Method Call")
+  print_benchmark("delta_blue", "DeltaBlue")
+  print_benchmark("binary_trees", "Binary Trees")
+  print_benchmark("fib", "Recursive Fibonacci")
+
 def main():
   parser = argparse.ArgumentParser(description="Run the benchmarks")
   parser.add_argument("benchmark", nargs='?',
@@ -214,6 +262,9 @@ def main():
   parser.add_argument("-l", "--language",
       action="append",
       help="Which language(s) to run benchmarks for")
+  parser.add_argument("--output-html",
+      action="store_true",
+      help="Output the results chart as HTML")
 
   args = parser.parse_args()
@@ -223,15 +274,13 @@ def main():
     read_baseline()
 
-  # Run all benchmarks.
-  if args.benchmark == "all":
-    for benchmark in BENCHMARKS:
-      run_benchmark(benchmark, args.language)
-    return
-
-  # Run the given benchmark.
+  # Run the benchmarks.
   for benchmark in BENCHMARKS:
-    if benchmark[0] == args.benchmark:
+    if benchmark[0] == args.benchmark or args.benchmark == "all":
       run_benchmark(benchmark, args.language)
 
+  if args.output_html:
+    print_html()
+
 main()
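
After this change, the global results map is keyed first by benchmark name and then by language name, which is the structure print_html walks. A minimal sketch of its shape, with illustrative placeholder times and scores rather than real measurements:

results = {
  "fib": {
    "wren": {"desc": "fib - wren", "times": [0.33, 0.34, 0.35], "score": 2991},
    "lua": {"desc": "fib - lua", "times": [0.34, 0.35, 0.36], "score": 2889}
  }
}

With the new flag, a run along the lines of "python benchmark.py all --output-html" (the harness script's actual name and path may differ) prints the same HTML tables that the performance page change below embeds.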

View File

@@ -1,6 +1,92 @@
 ^title Performance
 ^category reference
 
+Even though most benchmarks aren't worth the pixels they're printed on, people seem to like them, so here are a few:
+
+<h3>Method Call</h3>
+<table class="chart">
+  <tr>
+    <th>wren</th><td><div class="chart-bar wren" style="width: 99%;">4930&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>luajit (-joff)</th><td><div class="chart-bar" style="width: 86%;">4266&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>ruby</th><td><div class="chart-bar" style="width: 55%;">2752&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>lua</th><td><div class="chart-bar" style="width: 35%;">1728&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>python3</th><td><div class="chart-bar" style="width: 17%;">865&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>python</th><td><div class="chart-bar" style="width: 15%;">764&nbsp;</div></td>
+  </tr>
+</table>
+
+<h3>DeltaBlue</h3>
+<table class="chart">
+  <tr>
+    <th>wren</th><td><div class="chart-bar wren" style="width: 99%;">6662&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>python3</th><td><div class="chart-bar" style="width: 35%;">2336&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>python</th><td><div class="chart-bar" style="width: 32%;">2166&nbsp;</div></td>
+  </tr>
+</table>
+
+<h3>Binary Trees</h3>
+<table class="chart">
+  <tr>
+    <th>luajit (-joff)</th><td><div class="chart-bar" style="width: 99%;">6442&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>wren</th><td><div class="chart-bar wren" style="width: 50%;">3253&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>ruby</th><td><div class="chart-bar" style="width: 43%;">2806&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>python3</th><td><div class="chart-bar" style="width: 29%;">1926&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>lua</th><td><div class="chart-bar" style="width: 21%;">1375&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>python</th><td><div class="chart-bar" style="width: 21%;">1353&nbsp;</div></td>
+  </tr>
+</table>
+
+<h3>Recursive Fibonacci</h3>
+<table class="chart">
+  <tr>
+    <th>luajit (-joff)</th><td><div class="chart-bar" style="width: 99%;">6869&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>ruby</th><td><div class="chart-bar" style="width: 45%;">3138&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>wren</th><td><div class="chart-bar wren" style="width: 43%;">2991&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>lua</th><td><div class="chart-bar" style="width: 42%;">2889&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>python3</th><td><div class="chart-bar" style="width: 18%;">1280&nbsp;</div></td>
+  </tr>
+  <tr>
+    <th>python</th><td><div class="chart-bar" style="width: 18%;">1253&nbsp;</div></td>
+  </tr>
+</table>
+
+Higher scores (longer bars) are better. The score is the inverse of the running time, so if one language's score is twice another's, that language is twice as fast. Each benchmark is run ten times and the best time is kept. Only the time taken to execute the benchmarked code itself is measured, not interpreter startup.
+
+These were run on my MacBook Pro (2.3 GHz Intel Core i7, 16 GB of 1,600 MHz DDR3 RAM), tested against Lua 5.2.3, LuaJIT 2.0.2, Python 2.7.5, Python 3.3.4, and Ruby 2.0.0p247. LuaJIT is run with the JIT *disabled* (i.e. in bytecode interpreter mode) since all of the other languages are bytecode interpreters. LuaJIT with the JIT enabled is *much* faster than all of the other languages benchmarked, including Wren, because Mike Pall is a robot from the future.
+
+The benchmark harness and programs are [here](https://github.com/munificent/wren/tree/master/benchmark).
+
+## Why is Wren fast?
+
 Languages come in four rough performance buckets, from slowest to fastest:
 
 1. Tree-walk interpreters: Ruby 1.8.7 and earlier, Io, that
@@ -16,9 +102,7 @@ Languages come in four rough performance buckets, from slowest to fastest:
 Most languages in the first bucket aren't suitable for production use. (Servers are one exception, because you can throw more hardware at a slow language there.) Languages in the second bucket are fast enough for many use cases, even on client hardware, as the success of the listed languages shows. Languages in the third bucket are quite fast, but their implementations are breathtakingly complex, often rivaling that of compilers for statically-typed languages.
 
-## Why is Wren fast?
-
-Wren is in the second bucket. If you want a simple implementation that's fast enough for real use, this is the sweet spot. Despite being younger and having a smaller codebase, Wren's performance is quite competitive with other languages in that bucket. It has a few tricks up its sleeve:
+Wren is in the second bucket. If you want a simple implementation that's fast enough for real use, this is the sweet spot. In addition, Wren has a few tricks up its sleeve:
 
 ### A compact value representation
@@ -69,9 +153,3 @@ Most of Wren's performance comes from language design decisions. While it's dyna
 Wren's closest sibling, by far, is Lua. Lua is more dynamic than Wren, which makes its job harder. Lua also tries very hard to be compatible across a wide range of hardware and compilers. If you have a C89 compiler for it, odds are very good that you can run Lua on it.
 
 Wren cares about compatibility, but it requires C99 and IEEE double-precision floats. That may exclude some edge-case hardware, but it makes things like NaN tagging, computed gotos, and some other tricks possible.
-
-## Do you have benchmarks to prove this?
-
-Benchmarks are somewhere between an art and a carnival game. They can easily be manipulated to show what you want. But, yes, there are several benchmarks in the repo.
-
-**TODO: chart**
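
For context on the NaN tagging trick mentioned above: an IEEE 754 double whose exponent bits are all set is a NaN, and a quiet NaN still leaves most of its 52 mantissa bits unused, which is enough room to smuggle a type tag and payload into an ordinary double. Wren's real implementation is in C; the snippet below is only a minimal Python illustration of the bit trick, where the QNAN constant assumes the usual IEEE 754 double layout and the tag/untag helpers are hypothetical names for this sketch:

import math
import struct

# All exponent bits set plus the quiet bit marks a quiet NaN; the low
# mantissa bits are then free to carry a payload.
QNAN = 0x7ffc000000000000
PAYLOAD_MASK = 0x0003ffffffffffff

def tag(payload):
  '''Hide a small integer inside a NaN double's unused mantissa bits.'''
  return struct.unpack("<d", struct.pack("<Q", QNAN | payload))[0]

def untag(value):
  '''Recover the payload from a tagged NaN double.'''
  bits = struct.unpack("<Q", struct.pack("<d", value))[0]
  return bits & PAYLOAD_MASK

tagged = tag(42)
assert math.isnan(tagged)   # Still a well-formed double (a quiet NaN).
assert untag(tagged) == 42  # The payload survives the round trip.

Numbers stay plain doubles and pay no tagging cost; only non-numeric values are encoded in the NaN space, which is a large part of what makes the value representation compact.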

View File

@@ -189,6 +189,10 @@
   outline: none;
 }
 
+main {
+  padding-top: 12px;
+}
+
 a:hover {
   color: $blue-dark;
 }
@@ -299,6 +303,38 @@ body.reference {
   }
 }
 
+// Bar charts on the performance page.
+table.chart {
+  width: 100%;
+
+  td, th {
+    line-height: 14px;
+    margin: 0;
+    padding: 0;
+  }
+
+  th {
+    font-size: 14px;
+    text-align: left;
+    width: 100px;
+  }
+
+  .chart-bar {
+    display: inline-block;
+    font: 13px $body;
+    color: $light;
+    background: $blue;
+    border-bottom: solid 1px $blue-dark;
+    text-align: right;
+    border-radius: 2px;
+  }
+
+  .chart-bar.wren {
+    background: mix($blue, $blue-dark, 30%);
+    border-bottom: solid 1px $blue-dark;
+  }
+}
+
 /*
 @media only screen and (max-width: 639px) {
   // Shrink the window padding.