Clarify how string subscripting handles UTF-8.

2026-01-11 22:28:45 +01:00 · 2015-01-22 16:38:03 -08:00
parent a92e58c804
commit a5b00cebe7
8 changed files with 115 additions and 26 deletions
--- a/doc/site/core/string.markdown
+++ b/doc/site/core/string.markdown
@ -1,7 +1,37 @@
 ^title String Class
 ^category core

-A string of Unicode code points stored in UTF-8.
+Strings are immutable chunks of text. More formally, a string is a sequence of
+Unicode code points encoded in UTF-8.
+
+If you never work with any characters outside of the ASCII range, you can treat
+strings like a directly indexable array of characters. Once other characters
+get involved, it's important to understand the distinction.
+
+In UTF-8, a single Unicode code point (very roughly a single "character") may
+be encoded as one or more bytes. This means you can't directly index by code
+point. There's no way to find, say, the fifth code unit in a string without
+walking the string from the beginning and counting them as you go.
+
+Because counting code units is relatively slow, string methods generally index
+by *byte*, not *code unit*. When you do:
+
+    :::dart
+    someString[3]
+
+That means "get the code unit starting at *byte* three", not "get the third
+code unit in the string". This sounds scary, but keep in mind that the methods
+on string *return* byte indices too. So, for example, this does what you want:
+
+    :::dart
+    var metalBand = "Fäcëhämmër"
+    var hPosition = metalBand.indexOf("h")
+    IO.print(metalBand[hPosition]) // "h"
+
+In general, methods on strings will work in terms of code units if they can do
+so efficiently, and will otherwise deal in bytes.
+
+## Methods

 ### **contains**(other)

@ -13,20 +43,20 @@ It is a runtime error if `other` is not a string.

 Returns the length of the string.

-### **endsWith(suffix)**
+### **endsWith**(suffix)

 Checks if the string ends with `suffix`.

 It is a runtime error if `suffix` is not a string.

-### **indexOf(search)**
+### **indexOf**(search)

-Returns the index of `search` in the string or -1 if `search` is not a
-substring of the string.
+Returns the index of the first byte matching `search` in the string or `-1` if
+`search` was not found.

 It is a runtime error if `search` is not a string.

-### **startsWith(prefix)**
+### **startsWith**(prefix)

 Checks if the string starts with `prefix`.

@ -48,8 +78,19 @@ Check if the string is not equal to `other`.

 ### **[**index**]** operator

-Returns a one character string of the value at `index`.
+Returns a string containing the code unit starting at byte `index`.

-It is a runtime error if `index` is greater than the length of the string.
+    :::dart
+    IO.print("ʕ•ᴥ•ʔ"[5]) // "ᴥ".

-*Note: This does not currently handle UTF-8 characters correctly.*
+Since `ʕ` is two bytes in UTF-8 and `•` is three, the fifth byte points to the
+bear's nose.
+
+If `index` points into the middle of a UTF-8 sequence, this returns an empty
+string:
+
+    :::dart
+    IO.print("I ♥ NY"[3]) // "".
+
+It is a runtime error if `index` is greater than the number of bytes in the
+string.
--- a/script/generate_docs.py
+++ b/script/generate_docs.py
@ -1,5 +1,6 @@
 #!/usr/bin/env python

+import codecs
 import glob
 import fnmatch
 import os
@ -12,10 +13,12 @@ from datetime import datetime

 import markdown

-with open("doc/site/template.html") as f:
+
+with codecs.open("doc/site/template.html", encoding="utf-8") as f:
  template = f.read()

-with open("doc/site/template-core.html") as f:
+
+with codecs.open("doc/site/template-core.html", encoding="utf-8") as f:
  template_core = f.read()


@ -50,7 +53,7 @@ def format_file(path, skip_up_to_date):

  # Read the markdown file and preprocess it.
  contents = ""
-  with open(in_path, "r") as input:
+  with codecs.open(in_path, "r", encoding="utf-8") as input:
    # Read each line, preprocessing the special codes.
    for line in input:
      stripped = line.lstrip()
@ -73,7 +76,7 @@ def format_file(path, skip_up_to_date):
        headertype = stripped[:index]
        header = stripped[index:].strip()
        anchor = header.lower().replace(' ', '-')
-        anchor = re.compile('\.|\?|!|:|/').sub('', anchor)
+        anchor = re.compile(r'\.|\?|!|:|/|\*').sub('', anchor)

        contents += indentation + headertype
        contents += '{1} <a href="#{0}" name="{0}" class="header-anchor">#</a>\n'.format(anchor, header)
@ -100,7 +103,7 @@ def format_file(path, skip_up_to_date):
  # Write the html output.
  ensure_dir(os.path.dirname(out_path))

-  with open(out_path, 'w') as out:
+  with codecs.open(out_path, "w", encoding="utf-8") as out:
    out.write(page_template.format(**fields))

  print("converted " + path)
--- a/src/wren_core.c
+++ b/src/wren_core.c
@ -1078,13 +1078,7 @@ DEF_NATIVE(string_subscript)
    int index = validateIndex(vm, args, string->length, 1, "Subscript");
    if (index == -1) return PRIM_ERROR;

-    // The result is a one-character string.
-    // TODO: Handle UTF-8.
-    Value value = wrenNewUninitializedString(vm, 1);
-    ObjString* result = AS_STRING(value);
-    result->value[0] = AS_CSTRING(args[0])[index];
-    result->value[1] = '\0';
-    RETURN_VAL(value);
+    RETURN_VAL(wrenStringCodePointAt(vm, string, index));
  }

  if (!IS_RANGE(args[1]))
@ -1092,6 +1086,7 @@ DEF_NATIVE(string_subscript)
    RETURN_ERROR("Subscript must be a number or a range.");
  }

+  // TODO: Handle UTF-8 here.
  int step;
  int count = string->length;
  int start = calculateRange(vm, args, AS_RANGE(args[1]), &count, &step);
--- a/src/wren_value.c
+++ b/src/wren_value.c
@ -362,6 +362,30 @@ ObjString* wrenStringConcat(WrenVM* vm, const char* left, const char* right)
  return string;
 }

+Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, int index)
+{
+  ASSERT(index >= 0, "Index out of bounds.");
+  ASSERT(index < string->length, "Index out of bounds.");
+
+  char first = string->value[index];
+
+  // The first byte's high bits tell us how many bytes are in the UTF-8
+  // sequence. If the byte starts with 10xxxxx, it's the middle of a UTF-8
+  // sequence, so return an empty string.
+  int numBytes;
+  if      ((first & 0xc0) == 0x80) numBytes = 0;
+  else if ((first & 0xf8) == 0xf0) numBytes = 4;
+  else if ((first & 0xf0) == 0xe0) numBytes = 3;
+  else if ((first & 0xe0) == 0xc0) numBytes = 2;
+  else numBytes = 1;
+
+  Value value = wrenNewUninitializedString(vm, numBytes);
+  ObjString* result = AS_STRING(value);
+  memcpy(result->value, string->value + index, numBytes);
+  result->value[numBytes] = '\0';
+  return value;
+}
+
 Upvalue* wrenNewUpvalue(WrenVM* vm, Value* value)
 {
  Upvalue* upvalue = ALLOCATE(vm, Upvalue);
--- a/src/wren_value.h
+++ b/src/wren_value.h
@ -610,6 +610,11 @@ Value wrenNewUninitializedString(WrenVM* vm, size_t length);
 // Creates a new string that is the concatenation of [left] and [right].
 ObjString* wrenStringConcat(WrenVM* vm, const char* left, const char* right);

+// Creates a new string containing the code point in [string] starting at byte
+// [index]. If [index] points into the middle of a UTF-8 sequence, returns an
+// empty string.
+Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, int index);
+
 // Creates a new open upvalue pointing to [value] on the stack.
 Upvalue* wrenNewUpvalue(WrenVM* vm, Value* value);

--- a/test/string/literals.wren
+++ b/test/string/literals.wren
@ -1,2 +1,5 @@
 IO.print("".count)   // expect: 0
 IO.print("a string") // expect: a string
+
+// Non-ASCII.
+IO.print("A~¶Þॐஃ") // expect: A~¶Þॐஃ
--- a/test/string/subscript.wren
+++ b/test/string/subscript.wren
@ -10,5 +10,26 @@ IO.print("abcd"[-3]) // expect: b
 IO.print("abcd"[-2]) // expect: c
 IO.print("abcd"[-1]) // expect: d

-// Make sure the string's internal buffer size is correct.
-IO.print("abcd"[1] == "b") // expect: true
+// Regression: Make sure the string's internal buffer size is correct.
+IO.print("abcd"[1] == "b") // expect: true
+
+// Indexes by byte, not code point.
+//
+// Bytes:           11111
+//        012345678901234
+// Chars: sø mé ஃ  thî ng
+IO.print("søméஃthîng"[0]) // expect: s
+IO.print("søméஃthîng"[1]) // expect: ø
+IO.print("søméஃthîng"[3]) // expect: m
+IO.print("søméஃthîng"[6]) // expect: ஃ
+IO.print("søméஃthîng"[10]) // expect: h
+IO.print("søméஃthîng"[-1]) // expect: g
+IO.print("søméஃthîng"[-2]) // expect: n
+IO.print("søméஃthîng"[-4]) // expect: î
+
+// If the subscript is in the middle of a UTF-8 sequence, yield an empty string.
+IO.print("søméஃthîng"[2] == "") // expect: true
+IO.print("søméஃthîng"[7] == "") // expect: true
+IO.print("søméஃthîng"[8] == "") // expect: true
+IO.print("søméஃ"[-1] == "") // expect: true
+IO.print("søméஃ"[-2] == "") // expect: true
--- a/test/string/utf_8_in_literal.wren
+++ b/test/string/utf_8_in_literal.wren
@ -1,3 +0,0 @@
-IO.print("A~¶Þॐஃ") // expect: A~¶Þॐஃ
-
-// TODO: Malformed UTF-8 source files.