Clarify how string subscripting handles UTF-8.

This commit is contained in:
Bob Nystrom
2015-01-22 16:38:03 -08:00
parent a92e58c804
commit a5b00cebe7
8 changed files with 115 additions and 26 deletions

View File

@ -1,7 +1,37 @@
^title String Class
^category core
A string of Unicode code points stored in UTF-8.
Strings are immutable chunks of text. More formally, a string is a sequence of
Unicode code points encoded in UTF-8.
If you never work with any characters outside of the ASCII range, you can treat
strings like a directly indexable array of characters. Once other characters
get involved, it's important to understand the distinction.
In UTF-8, a single Unicode code point (very roughly a single "character") may
be encoded as one or more bytes. This means you can't directly index by code
point. There's no way to find, say, the fifth code unit in a string without
walking the string from the beginning and counting them as you go.
Because counting code units is relatively slow, string methods generally index
by *byte*, not *code unit*. When you do:
:::dart
someString[3]
That means "get the code unit starting at *byte* three", not "get the third
code unit in the string". This sounds scary, but keep in mind that the methods
on string *return* byte indices too. So, for example, this does what you want:
:::dart
var metalBand = "Fäcëhämmër"
var hPosition = metalBand.indexOf("h")
IO.print(metalBand[hPosition]) // "h"
In general, methods on strings will work in terms of code units if they can do
so efficiently, and will otherwise deal in bytes.
## Methods
### **contains**(other)
@ -13,20 +43,20 @@ It is a runtime error if `other` is not a string.
Returns the length of the string.
### **endsWith(suffix)**
### **endsWith**(suffix)
Checks if the string ends with `suffix`.
It is a runtime error if `suffix` is not a string.
### **indexOf(search)**
### **indexOf**(search)
Returns the index of `search` in the string or -1 if `search` is not a
substring of the string.
Returns the index of the first byte matching `search` in the string or `-1` if
`search` was not found.
It is a runtime error if `search` is not a string.
### **startsWith(prefix)**
### **startsWith**(prefix)
Checks if the string starts with `prefix`.
@ -48,8 +78,19 @@ Check if the string is not equal to `other`.
### **[**index**]** operator
Returns a one character string of the value at `index`.
Returns a string containing the code unit starting at byte `index`.
It is a runtime error if `index` is greater than the length of the string.
:::dart
IO.print("ʕ•ᴥ•ʔ"[5]) // "ᴥ".
*Note: This does not currently handle UTF-8 characters correctly.*
Since `ʕ` is two bytes in UTF-8 and `•` is three, the fifth byte points to the
bear's nose.
If `index` points into the middle of a UTF-8 sequence, this returns an empty
string:
:::dart
IO.print("I ♥ NY"[3]) // "".
It is a runtime error if `index` is greater than the number of bytes in the
string.

View File

@ -1,5 +1,6 @@
#!/usr/bin/env python
import codecs
import glob
import fnmatch
import os
@ -12,10 +13,12 @@ from datetime import datetime
import markdown
with open("doc/site/template.html") as f:
with codecs.open("doc/site/template.html", encoding="utf-8") as f:
template = f.read()
with open("doc/site/template-core.html") as f:
with codecs.open("doc/site/template-core.html", encoding="utf-8") as f:
template_core = f.read()
@ -50,7 +53,7 @@ def format_file(path, skip_up_to_date):
# Read the markdown file and preprocess it.
contents = ""
with open(in_path, "r") as input:
with codecs.open(in_path, "r", encoding="utf-8") as input:
# Read each line, preprocessing the special codes.
for line in input:
stripped = line.lstrip()
@ -73,7 +76,7 @@ def format_file(path, skip_up_to_date):
headertype = stripped[:index]
header = stripped[index:].strip()
anchor = header.lower().replace(' ', '-')
anchor = re.compile('\.|\?|!|:|/').sub('', anchor)
anchor = re.compile(r'\.|\?|!|:|/|\*').sub('', anchor)
contents += indentation + headertype
contents += '{1} <a href="#{0}" name="{0}" class="header-anchor">#</a>\n'.format(anchor, header)
@ -100,7 +103,7 @@ def format_file(path, skip_up_to_date):
# Write the html output.
ensure_dir(os.path.dirname(out_path))
with open(out_path, 'w') as out:
with codecs.open(out_path, "w", encoding="utf-8") as out:
out.write(page_template.format(**fields))
print("converted " + path)

View File

@ -1078,13 +1078,7 @@ DEF_NATIVE(string_subscript)
int index = validateIndex(vm, args, string->length, 1, "Subscript");
if (index == -1) return PRIM_ERROR;
// The result is a one-character string.
// TODO: Handle UTF-8.
Value value = wrenNewUninitializedString(vm, 1);
ObjString* result = AS_STRING(value);
result->value[0] = AS_CSTRING(args[0])[index];
result->value[1] = '\0';
RETURN_VAL(value);
RETURN_VAL(wrenStringCodePointAt(vm, string, index));
}
if (!IS_RANGE(args[1]))
@ -1092,6 +1086,7 @@ DEF_NATIVE(string_subscript)
RETURN_ERROR("Subscript must be a number or a range.");
}
// TODO: Handle UTF-8 here.
int step;
int count = string->length;
int start = calculateRange(vm, args, AS_RANGE(args[1]), &count, &step);

View File

@ -362,6 +362,30 @@ ObjString* wrenStringConcat(WrenVM* vm, const char* left, const char* right)
return string;
}
Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, int index)
{
ASSERT(index >= 0, "Index out of bounds.");
ASSERT(index < string->length, "Index out of bounds.");
char first = string->value[index];
// The first byte's high bits tell us how many bytes are in the UTF-8
// sequence. If the byte starts with 10xxxxx, it's the middle of a UTF-8
// sequence, so return an empty string.
int numBytes;
if ((first & 0xc0) == 0x80) numBytes = 0;
else if ((first & 0xf8) == 0xf0) numBytes = 4;
else if ((first & 0xf0) == 0xe0) numBytes = 3;
else if ((first & 0xe0) == 0xc0) numBytes = 2;
else numBytes = 1;
Value value = wrenNewUninitializedString(vm, numBytes);
ObjString* result = AS_STRING(value);
memcpy(result->value, string->value + index, numBytes);
result->value[numBytes] = '\0';
return value;
}
Upvalue* wrenNewUpvalue(WrenVM* vm, Value* value)
{
Upvalue* upvalue = ALLOCATE(vm, Upvalue);

View File

@ -610,6 +610,11 @@ Value wrenNewUninitializedString(WrenVM* vm, size_t length);
// Creates a new string that is the concatenation of [left] and [right].
ObjString* wrenStringConcat(WrenVM* vm, const char* left, const char* right);
// Creates a new string containing the code point in [string] starting at byte
// [index]. If [index] points into the middle of a UTF-8 sequence, returns an
// empty string.
Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, int index);
// Creates a new open upvalue pointing to [value] on the stack.
Upvalue* wrenNewUpvalue(WrenVM* vm, Value* value);

View File

@ -1,2 +1,5 @@
IO.print("".count) // expect: 0
IO.print("a string") // expect: a string
// Non-ASCII.
IO.print("A~¶Þॐஃ") // expect: A~¶Þॐஃ

View File

@ -10,5 +10,26 @@ IO.print("abcd"[-3]) // expect: b
IO.print("abcd"[-2]) // expect: c
IO.print("abcd"[-1]) // expect: d
// Make sure the string's internal buffer size is correct.
IO.print("abcd"[1] == "b") // expect: true
// Regression: Make sure the string's internal buffer size is correct.
IO.print("abcd"[1] == "b") // expect: true
// Indexes by byte, not code point.
//
// Bytes: 11111
// 012345678901234
// Chars: sø mé ஃ thî ng
IO.print("søméஃthîng"[0]) // expect: s
IO.print("søméஃthîng"[1]) // expect: ø
IO.print("søméஃthîng"[3]) // expect: m
IO.print("søméஃthîng"[6]) // expect: ஃ
IO.print("søméஃthîng"[10]) // expect: h
IO.print("søméஃthîng"[-1]) // expect: g
IO.print("søméஃthîng"[-2]) // expect: n
IO.print("søméஃthîng"[-4]) // expect: î
// If the subscript is in the middle of a UTF-8 sequence, yield an empty string.
IO.print("søméஃthîng"[2] == "") // expect: true
IO.print("søméஃthîng"[7] == "") // expect: true
IO.print("søméஃthîng"[8] == "") // expect: true
IO.print("søméஃ"[-1] == "") // expect: true
IO.print("søméஃ"[-2] == "") // expect: true

View File

@ -1,3 +0,0 @@
IO.print("A~¶Þॐஃ") // expect: A~¶Þॐஃ
// TODO: Malformed UTF-8 source files.