From a5b00cebe788db5b7bda1917106ae93f1327af2a Mon Sep 17 00:00:00 2001 From: Bob Nystrom Date: Thu, 22 Jan 2015 16:38:03 -0800 Subject: [PATCH] Clarify how string subscripting handles UTF-8. --- doc/site/core/string.markdown | 59 ++++++++++++++++++++++++++----- script/generate_docs.py | 13 ++++--- src/wren_core.c | 9 ++--- src/wren_value.c | 24 +++++++++++++ src/wren_value.h | 5 +++ test/string/literals.wren | 3 ++ test/string/subscript.wren | 25 +++++++++++-- test/string/utf_8_in_literal.wren | 3 -- 8 files changed, 115 insertions(+), 26 deletions(-) delete mode 100644 test/string/utf_8_in_literal.wren diff --git a/doc/site/core/string.markdown b/doc/site/core/string.markdown index 1ea51140..88328b5c 100644 --- a/doc/site/core/string.markdown +++ b/doc/site/core/string.markdown @@ -1,7 +1,37 @@ ^title String Class ^category core -A string of Unicode code points stored in UTF-8. +Strings are immutable chunks of text. More formally, a string is a sequence of +Unicode code points encoded in UTF-8. + +If you never work with any characters outside of the ASCII range, you can treat +strings like a directly indexable array of characters. Once other characters +get involved, it's important to understand the distinction. + +In UTF-8, a single Unicode code point (very roughly a single "character") may +be encoded as one or more bytes. This means you can't directly index by code +point. There's no way to find, say, the fifth code unit in a string without +walking the string from the beginning and counting them as you go. + +Because counting code units is relatively slow, string methods generally index +by *byte*, not *code unit*. When you do: + + :::dart + someString[3] + +That means "get the code unit starting at *byte* three", not "get the third +code unit in the string". This sounds scary, but keep in mind that the methods +on string *return* byte indices too. So, for example, this does what you want: + + :::dart + var metalBand = "Fäcëhämmër" + var hPosition = metalBand.indexOf("h") + IO.print(metalBand[hPosition]) // "h" + +In general, methods on strings will work in terms of code units if they can do +so efficiently, and will otherwise deal in bytes. + +## Methods ### **contains**(other) @@ -13,20 +43,20 @@ It is a runtime error if `other` is not a string. Returns the length of the string. -### **endsWith(suffix)** +### **endsWith**(suffix) Checks if the string ends with `suffix`. It is a runtime error if `suffix` is not a string. -### **indexOf(search)** +### **indexOf**(search) -Returns the index of `search` in the string or -1 if `search` is not a -substring of the string. +Returns the index of the first byte matching `search` in the string or `-1` if +`search` was not found. It is a runtime error if `search` is not a string. -### **startsWith(prefix)** +### **startsWith**(prefix) Checks if the string starts with `prefix`. @@ -48,8 +78,19 @@ Check if the string is not equal to `other`. ### **[**index**]** operator -Returns a one character string of the value at `index`. +Returns a string containing the code unit starting at byte `index`. -It is a runtime error if `index` is greater than the length of the string. + :::dart + IO.print("ʕ•ᴥ•ʔ"[5]) // "ᴥ". -*Note: This does not currently handle UTF-8 characters correctly.* +Since `ʕ` is two bytes in UTF-8 and `•` is three, the fifth byte points to the +bear's nose. + +If `index` points into the middle of a UTF-8 sequence, this returns an empty +string: + + :::dart + IO.print("I ♥ NY"[3]) // "". + +It is a runtime error if `index` is greater than the number of bytes in the +string. diff --git a/script/generate_docs.py b/script/generate_docs.py index 3c03b645..ac56c311 100755 --- a/script/generate_docs.py +++ b/script/generate_docs.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +import codecs import glob import fnmatch import os @@ -12,10 +13,12 @@ from datetime import datetime import markdown -with open("doc/site/template.html") as f: + +with codecs.open("doc/site/template.html", encoding="utf-8") as f: template = f.read() -with open("doc/site/template-core.html") as f: + +with codecs.open("doc/site/template-core.html", encoding="utf-8") as f: template_core = f.read() @@ -50,7 +53,7 @@ def format_file(path, skip_up_to_date): # Read the markdown file and preprocess it. contents = "" - with open(in_path, "r") as input: + with codecs.open(in_path, "r", encoding="utf-8") as input: # Read each line, preprocessing the special codes. for line in input: stripped = line.lstrip() @@ -73,7 +76,7 @@ def format_file(path, skip_up_to_date): headertype = stripped[:index] header = stripped[index:].strip() anchor = header.lower().replace(' ', '-') - anchor = re.compile('\.|\?|!|:|/').sub('', anchor) + anchor = re.compile(r'\.|\?|!|:|/|\*').sub('', anchor) contents += indentation + headertype contents += '{1} #\n'.format(anchor, header) @@ -100,7 +103,7 @@ def format_file(path, skip_up_to_date): # Write the html output. ensure_dir(os.path.dirname(out_path)) - with open(out_path, 'w') as out: + with codecs.open(out_path, "w", encoding="utf-8") as out: out.write(page_template.format(**fields)) print("converted " + path) diff --git a/src/wren_core.c b/src/wren_core.c index b5f63eb9..c1ea56e3 100644 --- a/src/wren_core.c +++ b/src/wren_core.c @@ -1078,13 +1078,7 @@ DEF_NATIVE(string_subscript) int index = validateIndex(vm, args, string->length, 1, "Subscript"); if (index == -1) return PRIM_ERROR; - // The result is a one-character string. - // TODO: Handle UTF-8. - Value value = wrenNewUninitializedString(vm, 1); - ObjString* result = AS_STRING(value); - result->value[0] = AS_CSTRING(args[0])[index]; - result->value[1] = '\0'; - RETURN_VAL(value); + RETURN_VAL(wrenStringCodePointAt(vm, string, index)); } if (!IS_RANGE(args[1])) @@ -1092,6 +1086,7 @@ DEF_NATIVE(string_subscript) RETURN_ERROR("Subscript must be a number or a range."); } + // TODO: Handle UTF-8 here. int step; int count = string->length; int start = calculateRange(vm, args, AS_RANGE(args[1]), &count, &step); diff --git a/src/wren_value.c b/src/wren_value.c index b1a17b82..bcab27a5 100644 --- a/src/wren_value.c +++ b/src/wren_value.c @@ -362,6 +362,30 @@ ObjString* wrenStringConcat(WrenVM* vm, const char* left, const char* right) return string; } +Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, int index) +{ + ASSERT(index >= 0, "Index out of bounds."); + ASSERT(index < string->length, "Index out of bounds."); + + char first = string->value[index]; + + // The first byte's high bits tell us how many bytes are in the UTF-8 + // sequence. If the byte starts with 10xxxxx, it's the middle of a UTF-8 + // sequence, so return an empty string. + int numBytes; + if ((first & 0xc0) == 0x80) numBytes = 0; + else if ((first & 0xf8) == 0xf0) numBytes = 4; + else if ((first & 0xf0) == 0xe0) numBytes = 3; + else if ((first & 0xe0) == 0xc0) numBytes = 2; + else numBytes = 1; + + Value value = wrenNewUninitializedString(vm, numBytes); + ObjString* result = AS_STRING(value); + memcpy(result->value, string->value + index, numBytes); + result->value[numBytes] = '\0'; + return value; +} + Upvalue* wrenNewUpvalue(WrenVM* vm, Value* value) { Upvalue* upvalue = ALLOCATE(vm, Upvalue); diff --git a/src/wren_value.h b/src/wren_value.h index 359975d1..9150318e 100644 --- a/src/wren_value.h +++ b/src/wren_value.h @@ -610,6 +610,11 @@ Value wrenNewUninitializedString(WrenVM* vm, size_t length); // Creates a new string that is the concatenation of [left] and [right]. ObjString* wrenStringConcat(WrenVM* vm, const char* left, const char* right); +// Creates a new string containing the code point in [string] starting at byte +// [index]. If [index] points into the middle of a UTF-8 sequence, returns an +// empty string. +Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, int index); + // Creates a new open upvalue pointing to [value] on the stack. Upvalue* wrenNewUpvalue(WrenVM* vm, Value* value); diff --git a/test/string/literals.wren b/test/string/literals.wren index 18a6e995..870d3f89 100644 --- a/test/string/literals.wren +++ b/test/string/literals.wren @@ -1,2 +1,5 @@ IO.print("".count) // expect: 0 IO.print("a string") // expect: a string + +// Non-ASCII. +IO.print("A~¶Þॐஃ") // expect: A~¶Þॐஃ diff --git a/test/string/subscript.wren b/test/string/subscript.wren index beea1cd9..8212e32b 100644 --- a/test/string/subscript.wren +++ b/test/string/subscript.wren @@ -10,5 +10,26 @@ IO.print("abcd"[-3]) // expect: b IO.print("abcd"[-2]) // expect: c IO.print("abcd"[-1]) // expect: d -// Make sure the string's internal buffer size is correct. -IO.print("abcd"[1] == "b") // expect: true \ No newline at end of file +// Regression: Make sure the string's internal buffer size is correct. +IO.print("abcd"[1] == "b") // expect: true + +// Indexes by byte, not code point. +// +// Bytes: 11111 +// 012345678901234 +// Chars: sø mé ஃ thî ng +IO.print("søméஃthîng"[0]) // expect: s +IO.print("søméஃthîng"[1]) // expect: ø +IO.print("søméஃthîng"[3]) // expect: m +IO.print("søméஃthîng"[6]) // expect: ஃ +IO.print("søméஃthîng"[10]) // expect: h +IO.print("søméஃthîng"[-1]) // expect: g +IO.print("søméஃthîng"[-2]) // expect: n +IO.print("søméஃthîng"[-4]) // expect: î + +// If the subscript is in the middle of a UTF-8 sequence, yield an empty string. +IO.print("søméஃthîng"[2] == "") // expect: true +IO.print("søméஃthîng"[7] == "") // expect: true +IO.print("søméஃthîng"[8] == "") // expect: true +IO.print("søméஃ"[-1] == "") // expect: true +IO.print("søméஃ"[-2] == "") // expect: true diff --git a/test/string/utf_8_in_literal.wren b/test/string/utf_8_in_literal.wren deleted file mode 100644 index 7d2b805f..00000000 --- a/test/string/utf_8_in_literal.wren +++ /dev/null @@ -1,3 +0,0 @@ -IO.print("A~¶Þॐஃ") // expect: A~¶Þॐஃ - -// TODO: Malformed UTF-8 source files. \ No newline at end of file