From a5b00cebe788db5b7bda1917106ae93f1327af2a Mon Sep 17 00:00:00 2001
From: Bob Nystrom <robert@stuffwithstuff.com>
Date: Thu, 22 Jan 2015 16:38:03 -0800
Subject: [PATCH] Clarify how string subscripting handles UTF-8.

---
 doc/site/core/string.markdown     | 59 ++++++++++++++++++++++++++-----
 script/generate_docs.py           | 13 ++++---
 src/wren_core.c                   |  9 ++---
 src/wren_value.c                  | 24 +++++++++++++
 src/wren_value.h                  |  5 +++
 test/string/literals.wren         |  3 ++
 test/string/subscript.wren        | 25 +++++++++++--
 test/string/utf_8_in_literal.wren |  3 --
 8 files changed, 115 insertions(+), 26 deletions(-)
 delete mode 100644 test/string/utf_8_in_literal.wren

diff --git a/doc/site/core/string.markdown b/doc/site/core/string.markdown
index 1ea51140..88328b5c 100644
--- a/doc/site/core/string.markdown
+++ b/doc/site/core/string.markdown
@@ -1,7 +1,37 @@
 ^title String Class
 ^category core
 
-A string of Unicode code points stored in UTF-8.
+Strings are immutable chunks of text. More formally, a string is a sequence of
+Unicode code points encoded in UTF-8.
+
+If you never work with any characters outside of the ASCII range, you can treat
+strings like a directly indexable array of characters. Once other characters
+get involved, it's important to understand the distinction.
+
+In UTF-8, a single Unicode code point (very roughly a single "character") may
+be encoded as one or more bytes. This means you can't directly index by code
+point. There's no way to find, say, the fifth code unit in a string without
+walking the string from the beginning and counting them as you go.
+
+Because counting code units is relatively slow, string methods generally index
+by *byte*, not *code unit*. When you do:
+
+    :::dart
+    someString[3]
+
+That means "get the code unit starting at *byte* three", not "get the third
+code unit in the string". This sounds scary, but keep in mind that the methods
+on string *return* byte indices too. So, for example, this does what you want:
+
+    :::dart
+    var metalBand = "Fäcëhämmër"
+    var hPosition = metalBand.indexOf("h")
+    IO.print(metalBand[hPosition]) // "h"
+
+In general, methods on strings will work in terms of code units if they can do
+so efficiently, and will otherwise deal in bytes.
+
+## Methods
 
 ### **contains**(other)
 
@@ -13,20 +43,20 @@ It is a runtime error if `other` is not a string.
 
 Returns the length of the string.
 
-### **endsWith(suffix)**
+### **endsWith**(suffix)
 
 Checks if the string ends with `suffix`.
 
 It is a runtime error if `suffix` is not a string.
 
-### **indexOf(search)**
+### **indexOf**(search)
 
-Returns the index of `search` in the string or -1 if `search` is not a
-substring of the string.
+Returns the index of the first byte matching `search` in the string or `-1` if
+`search` was not found.
 
 It is a runtime error if `search` is not a string.
 
-### **startsWith(prefix)**
+### **startsWith**(prefix)
 
 Checks if the string starts with `prefix`.
 
@@ -48,8 +78,19 @@ Check if the string is not equal to `other`.
 
 ### **[**index**]** operator
 
-Returns a one character string of the value at `index`.
+Returns a string containing the code unit starting at byte `index`.
 
-It is a runtime error if `index` is greater than the length of the string.
+    :::dart
+    IO.print("ʕ•ᴥ•ʔ"[5]) // "ᴥ".
 
-*Note: This does not currently handle UTF-8 characters correctly.*
+Since `ʕ` is two bytes in UTF-8 and `•` is three, the fifth byte points to the
+bear's nose.
+
+If `index` points into the middle of a UTF-8 sequence, this returns an empty
+string:
+
+    :::dart
+    IO.print("I ♥ NY"[3]) // "".
+
+It is a runtime error if `index` is greater than the number of bytes in the
+string.
diff --git a/script/generate_docs.py b/script/generate_docs.py
index 3c03b645..ac56c311 100755
--- a/script/generate_docs.py
+++ b/script/generate_docs.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 
+import codecs
 import glob
 import fnmatch
 import os
@@ -12,10 +13,12 @@ from datetime import datetime
 
 import markdown
 
-with open("doc/site/template.html") as f:
+
+with codecs.open("doc/site/template.html", encoding="utf-8") as f:
   template = f.read()
 
-with open("doc/site/template-core.html") as f:
+
+with codecs.open("doc/site/template-core.html", encoding="utf-8") as f:
   template_core = f.read()
 
 
@@ -50,7 +53,7 @@ def format_file(path, skip_up_to_date):
 
   # Read the markdown file and preprocess it.
   contents = ""
-  with open(in_path, "r") as input:
+  with codecs.open(in_path, "r", encoding="utf-8") as input:
     # Read each line, preprocessing the special codes.
     for line in input:
       stripped = line.lstrip()
@@ -73,7 +76,7 @@ def format_file(path, skip_up_to_date):
         headertype = stripped[:index]
         header = stripped[index:].strip()
         anchor = header.lower().replace(' ', '-')
-        anchor = re.compile('\.|\?|!|:|/').sub('', anchor)
+        anchor = re.compile(r'\.|\?|!|:|/|\*').sub('', anchor)
 
         contents += indentation + headertype
         contents += '{1} <a href="#{0}" name="{0}" class="header-anchor">#</a>\n'.format(anchor, header)
@@ -100,7 +103,7 @@ def format_file(path, skip_up_to_date):
   # Write the html output.
   ensure_dir(os.path.dirname(out_path))
 
-  with open(out_path, 'w') as out:
+  with codecs.open(out_path, "w", encoding="utf-8") as out:
     out.write(page_template.format(**fields))
 
   print("converted " + path)
diff --git a/src/wren_core.c b/src/wren_core.c
index b5f63eb9..c1ea56e3 100644
--- a/src/wren_core.c
+++ b/src/wren_core.c
@@ -1078,13 +1078,7 @@ DEF_NATIVE(string_subscript)
     int index = validateIndex(vm, args, string->length, 1, "Subscript");
     if (index == -1) return PRIM_ERROR;
 
-    // The result is a one-character string.
-    // TODO: Handle UTF-8.
-    Value value = wrenNewUninitializedString(vm, 1);
-    ObjString* result = AS_STRING(value);
-    result->value[0] = AS_CSTRING(args[0])[index];
-    result->value[1] = '\0';
-    RETURN_VAL(value);
+    RETURN_VAL(wrenStringCodePointAt(vm, string, index));
   }
 
   if (!IS_RANGE(args[1]))
@@ -1092,6 +1086,7 @@ DEF_NATIVE(string_subscript)
     RETURN_ERROR("Subscript must be a number or a range.");
   }
 
+  // TODO: Handle UTF-8 here.
   int step;
   int count = string->length;
   int start = calculateRange(vm, args, AS_RANGE(args[1]), &count, &step);
diff --git a/src/wren_value.c b/src/wren_value.c
index b1a17b82..bcab27a5 100644
--- a/src/wren_value.c
+++ b/src/wren_value.c
@@ -362,6 +362,30 @@ ObjString* wrenStringConcat(WrenVM* vm, const char* left, const char* right)
   return string;
 }
 
+Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, int index)
+{
+  ASSERT(index >= 0, "Index out of bounds.");
+  ASSERT(index < string->length, "Index out of bounds.");
+
+  char first = string->value[index];
+
+  // The first byte's high bits tell us how many bytes are in the UTF-8
+  // sequence. If the byte starts with 10xxxxx, it's the middle of a UTF-8
+  // sequence, so return an empty string.
+  int numBytes;
+  if      ((first & 0xc0) == 0x80) numBytes = 0;
+  else if ((first & 0xf8) == 0xf0) numBytes = 4;
+  else if ((first & 0xf0) == 0xe0) numBytes = 3;
+  else if ((first & 0xe0) == 0xc0) numBytes = 2;
+  else numBytes = 1;
+
+  Value value = wrenNewUninitializedString(vm, numBytes);
+  ObjString* result = AS_STRING(value);
+  memcpy(result->value, string->value + index, numBytes);
+  result->value[numBytes] = '\0';
+  return value;
+}
+
 Upvalue* wrenNewUpvalue(WrenVM* vm, Value* value)
 {
   Upvalue* upvalue = ALLOCATE(vm, Upvalue);
diff --git a/src/wren_value.h b/src/wren_value.h
index 359975d1..9150318e 100644
--- a/src/wren_value.h
+++ b/src/wren_value.h
@@ -610,6 +610,11 @@ Value wrenNewUninitializedString(WrenVM* vm, size_t length);
 // Creates a new string that is the concatenation of [left] and [right].
 ObjString* wrenStringConcat(WrenVM* vm, const char* left, const char* right);
 
+// Creates a new string containing the code point in [string] starting at byte
+// [index]. If [index] points into the middle of a UTF-8 sequence, returns an
+// empty string.
+Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, int index);
+
 // Creates a new open upvalue pointing to [value] on the stack.
 Upvalue* wrenNewUpvalue(WrenVM* vm, Value* value);
 
diff --git a/test/string/literals.wren b/test/string/literals.wren
index 18a6e995..870d3f89 100644
--- a/test/string/literals.wren
+++ b/test/string/literals.wren
@@ -1,2 +1,5 @@
 IO.print("".count)   // expect: 0
 IO.print("a string") // expect: a string
+
+// Non-ASCII.
+IO.print("A~¶Þॐஃ") // expect: A~¶Þॐஃ
diff --git a/test/string/subscript.wren b/test/string/subscript.wren
index beea1cd9..8212e32b 100644
--- a/test/string/subscript.wren
+++ b/test/string/subscript.wren
@@ -10,5 +10,26 @@ IO.print("abcd"[-3]) // expect: b
 IO.print("abcd"[-2]) // expect: c
 IO.print("abcd"[-1]) // expect: d
 
-// Make sure the string's internal buffer size is correct.
-IO.print("abcd"[1] == "b") // expect: true
\ No newline at end of file
+// Regression: Make sure the string's internal buffer size is correct.
+IO.print("abcd"[1] == "b") // expect: true
+
+// Indexes by byte, not code point.
+//
+// Bytes:           11111
+//        012345678901234
+// Chars: sø mé ஃ  thî ng
+IO.print("søméஃthîng"[0]) // expect: s
+IO.print("søméஃthîng"[1]) // expect: ø
+IO.print("søméஃthîng"[3]) // expect: m
+IO.print("søméஃthîng"[6]) // expect: ஃ
+IO.print("søméஃthîng"[10]) // expect: h
+IO.print("søméஃthîng"[-1]) // expect: g
+IO.print("søméஃthîng"[-2]) // expect: n
+IO.print("søméஃthîng"[-4]) // expect: î
+
+// If the subscript is in the middle of a UTF-8 sequence, yield an empty string.
+IO.print("søméஃthîng"[2] == "") // expect: true
+IO.print("søméஃthîng"[7] == "") // expect: true
+IO.print("søméஃthîng"[8] == "") // expect: true
+IO.print("søméஃ"[-1] == "") // expect: true
+IO.print("søméஃ"[-2] == "") // expect: true
diff --git a/test/string/utf_8_in_literal.wren b/test/string/utf_8_in_literal.wren
deleted file mode 100644
index 7d2b805f..00000000
--- a/test/string/utf_8_in_literal.wren
+++ /dev/null
@@ -1,3 +0,0 @@
-IO.print("A~¶Þॐஃ") // expect: A~¶Þॐஃ
-
-// TODO: Malformed UTF-8 source files.
\ No newline at end of file