From 783a5b750aa610670114fe2f8f00682b6a50a0f0 Mon Sep 17 00:00:00 2001 From: Bob Nystrom Date: Tue, 1 Sep 2015 22:14:55 -0700 Subject: [PATCH] Get ranges working in string subscripts (again). Now with UTF-8 hotness! --- src/vm/wren_compiler.c | 2 +- src/vm/wren_core.c | 17 ++----- src/vm/wren_utils.c | 32 +++++++++--- src/vm/wren_utils.h | 19 +++++-- src/vm/wren_value.c | 49 +++++++++++++------ src/vm/wren_value.h | 6 +++ test/core/string/subscript_range.wren | 14 +++++- .../string/subscript_range_from_not_int.wren | 1 - .../subscript_range_from_too_large.wren | 1 - .../subscript_range_from_too_small.wren | 1 - ...ubscript_range_to_exclusive_too_large.wren | 1 - ...ubscript_range_to_exclusive_too_small.wren | 1 - .../string/subscript_range_to_not_int.wren | 1 - .../string/subscript_range_to_too_large.wren | 1 - .../string/subscript_range_to_too_small.wren | 1 - 15 files changed, 96 insertions(+), 51 deletions(-) diff --git a/src/vm/wren_compiler.c b/src/vm/wren_compiler.c index 49c9bd94..d4165773 100644 --- a/src/vm/wren_compiler.c +++ b/src/vm/wren_compiler.c @@ -697,7 +697,7 @@ static void readUnicodeEscape(Parser* parser, ByteBuffer* string) int value = readHexEscape(parser, 4, "Unicode"); // Grow the buffer enough for the encoded result. - int numBytes = wrenUtf8NumBytes(value); + int numBytes = wrenUtf8EncodeNumBytes(value); if (numBytes != 0) { wrenByteBufferFill(parser->vm, string, 0, numBytes); diff --git a/src/vm/wren_core.c b/src/vm/wren_core.c index fe1b930d..8484ae6c 100644 --- a/src/vm/wren_core.c +++ b/src/vm/wren_core.c @@ -604,7 +604,7 @@ DEF_PRIMITIVE(list_subscript) ObjList* result = wrenNewList(vm, count); for (uint32_t i = 0; i < count; i++) { - result->elements.data[i] = list->elements.data[start + (i * step)]; + result->elements.data[i] = list->elements.data[start + i * step]; } RETURN_OBJ(result); @@ -1229,23 +1229,12 @@ DEF_PRIMITIVE(string_subscript) RETURN_ERROR("Subscript must be a number or a range."); } - // TODO: Handle UTF-8 here. - /* int step; - int count = string->length; + uint32_t count = string->length; int start = calculateRange(vm, args, AS_RANGE(args[1]), &count, &step); if (start == -1) return PRIM_ERROR; - ObjString* result = wrenNewUninitializedString(vm, count); - for (int i = 0; i < count; i++) - { - result->value[i] = string->value[start + (i * step)]; - } - result->value[count] = '\0'; - - RETURN_OBJ(result); - */ - RETURN_ERROR("Subscript ranges for strings are not implemented yet."); + RETURN_VAL(wrenNewStringFromRange(vm, string, start, count, step)); } DEF_PRIMITIVE(string_toString) diff --git a/src/vm/wren_utils.c b/src/vm/wren_utils.c index 32a2b3a7..b5c31888 100644 --- a/src/vm/wren_utils.c +++ b/src/vm/wren_utils.c @@ -59,7 +59,7 @@ int wrenSymbolTableFind(SymbolTable* symbols, const char* name, size_t length) return -1; } -int wrenUtf8NumBytes(int value) +int wrenUtf8EncodeNumBytes(int value) { ASSERT(value >= 0, "Cannot encode a negative value."); @@ -70,12 +70,13 @@ int wrenUtf8NumBytes(int value) return 0; } -void wrenUtf8Encode(int value, uint8_t* bytes) +int wrenUtf8Encode(int value, uint8_t* bytes) { if (value <= 0x7f) { // Single byte (i.e. fits in ASCII). *bytes = value & 0x7f; + return 1; } else if (value <= 0x7ff) { @@ -83,6 +84,7 @@ void wrenUtf8Encode(int value, uint8_t* bytes) *bytes = 0xc0 | ((value & 0x7c0) >> 6); bytes++; *bytes = 0x80 | (value & 0x3f); + return 2; } else if (value <= 0xffff) { @@ -92,6 +94,7 @@ void wrenUtf8Encode(int value, uint8_t* bytes) *bytes = 0x80 | ((value & 0xfc0) >> 6); bytes++; *bytes = 0x80 | (value & 0x3f); + return 3; } else if (value <= 0x10ffff) { @@ -103,12 +106,11 @@ void wrenUtf8Encode(int value, uint8_t* bytes) *bytes = 0x80 | ((value & 0xfc0) >> 6); bytes++; *bytes = 0x80 | (value & 0x3f); + return 4; } - else - { - // Invalid Unicode value. See: http://tools.ietf.org/html/rfc3629 - ASSERT(false, "Invalid UTF-8 value."); - } + + // Invalid Unicode value. See: http://tools.ietf.org/html/rfc3629 + UNREACHABLE(); } int wrenUtf8Decode(const uint8_t* bytes, uint32_t length) @@ -158,3 +160,19 @@ int wrenUtf8Decode(const uint8_t* bytes, uint32_t length) return value; } + +int wrenUtf8DecodeNumBytes(const char* string, uint32_t index) +{ + char first = string[index]; + + // If the byte starts with 10xxxxx, it's the middle of a UTF-8 sequence, so + // don't count it at all. + if ((first & 0xc0) == 0x80) return 0; + + // The first byte's high bits tell us how many bytes are in the UTF-8 + // sequence. + if ((first & 0xf8) == 0xf0) return 4; + if ((first & 0xf0) == 0xe0) return 3; + if ((first & 0xe0) == 0xc0) return 2; + return 1; +} diff --git a/src/vm/wren_utils.h b/src/vm/wren_utils.h index f81063f3..469162c1 100644 --- a/src/vm/wren_utils.h +++ b/src/vm/wren_utils.h @@ -100,14 +100,25 @@ int wrenSymbolTableFind(SymbolTable* symbols, const char* name, size_t length); // Returns the number of bytes needed to encode [value] in UTF-8. // // Returns 0 if [value] is too large to encode. -int wrenUtf8NumBytes(int value); +int wrenUtf8EncodeNumBytes(int value); // Encodes value as a series of bytes in [bytes], which is assumed to be large // enough to hold the encoded result. -void wrenUtf8Encode(int value, uint8_t* bytes); +// +// Returns the number of written bytes. +int wrenUtf8Encode(int value, uint8_t* bytes); -// Decodes the UTF-8 sequence in [bytes] (which has max [length]), returning -// the code point. +// Decodes the UTF-8 sequence starting at [bytes] (which has max [length]), +// returning the code point. +// +// Returns -1 if the bytes are not a valid UTF-8 sequence. int wrenUtf8Decode(const uint8_t* bytes, uint32_t length); +// Returns the number of bytes in the UTF-8 sequence starting at [index] in +// [string]. +// +// If the character at that index is not the beginning of a UTF-8 sequence, +// returns 0. +int wrenUtf8DecodeNumBytes(const char* string, uint32_t index); + #endif diff --git a/src/vm/wren_value.c b/src/vm/wren_value.c index e4b75cf7..dec3c080 100644 --- a/src/vm/wren_value.c +++ b/src/vm/wren_value.c @@ -625,13 +625,41 @@ Value wrenNewString(WrenVM* vm, const char* text, size_t length) ObjString* string = allocateString(vm, length); // Copy the string (if given one). - if (length > 0) memcpy(string->value, text, length); + if (length > 0 && text != NULL) memcpy(string->value, text, length); hashString(string); - return OBJ_VAL(string); } +Value wrenNewStringFromRange(WrenVM* vm, ObjString* source, int start, + uint32_t count, int step) +{ + uint8_t* from = (uint8_t*)source->value; + int length = 0; + for (uint32_t i = 0; i < count; i++) + { + length += wrenUtf8EncodeNumBytes(from[start + i * step]); + } + + ObjString* result = allocateString(vm, length); + result->value[length] = '\0'; + + uint8_t* to = (uint8_t*)result->value; + for (uint32_t i = 0; i < count; i++) + { + int index = start + i * step; + int codePoint = wrenUtf8Decode(from + index, source->length - index); + + if (codePoint != -1) + { + to += wrenUtf8Encode(codePoint, to); + } + } + + hashString(result); + return OBJ_VAL(result); +} + Value wrenNumToString(WrenVM* vm, double value) { // Corner case: If the value is NaN, different versions of libc produce @@ -664,7 +692,7 @@ Value wrenNumToString(WrenVM* vm, double value) Value wrenStringFromCodePoint(WrenVM* vm, int value) { - int length = wrenUtf8NumBytes(value); + int length = wrenUtf8EncodeNumBytes(value); ASSERT(length != 0, "Value out of range."); ObjString* string = allocateString(vm, length); @@ -743,19 +771,8 @@ Value wrenStringFormat(WrenVM* vm, const char* format, ...) Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, uint32_t index) { ASSERT(index < string->length, "Index out of bounds."); - - char first = string->value[index]; - - // The first byte's high bits tell us how many bytes are in the UTF-8 - // sequence. If the byte starts with 10xxxxx, it's the middle of a UTF-8 - // sequence, so return an empty string. - int numBytes; - if ((first & 0xc0) == 0x80) numBytes = 0; - else if ((first & 0xf8) == 0xf0) numBytes = 4; - else if ((first & 0xf0) == 0xe0) numBytes = 3; - else if ((first & 0xe0) == 0xc0) numBytes = 2; - else numBytes = 1; - + + int numBytes = wrenUtf8DecodeNumBytes(string->value, index); return wrenNewString(vm, string->value + index, numBytes); } diff --git a/src/vm/wren_value.h b/src/vm/wren_value.h index f453bcd7..442180d2 100644 --- a/src/vm/wren_value.h +++ b/src/vm/wren_value.h @@ -715,6 +715,12 @@ Value wrenNewRange(WrenVM* vm, double from, double to, bool isInclusive); // [text] may be NULL if [length] is zero. Value wrenNewString(WrenVM* vm, const char* text, size_t length); +// Creates a new string object by taking a range of characters from [source]. +// The range starts at [start], contains [count] bytes, and increments by +// [step]. +Value wrenNewStringFromRange(WrenVM* vm, ObjString* source, int start, + uint32_t count, int step); + // Produces a string representation of [value]. Value wrenNumToString(WrenVM* vm, double value); diff --git a/test/core/string/subscript_range.wren b/test/core/string/subscript_range.wren index 2aa824a2..d78359b8 100644 --- a/test/core/string/subscript_range.wren +++ b/test/core/string/subscript_range.wren @@ -1,4 +1,3 @@ -// skip: Range subscripts for strings don't handle UTF-8. var string = "abcde" IO.print(string[0..0]) // expect: a IO.print(string[1...1] == "") // expect: true @@ -33,3 +32,16 @@ IO.print(string[3...-6]) // expect: dcba // An empty range at zero is allowed on an empty string. IO.print(""[0...0] == "") // expect: true IO.print(""[0..-1] == "") // expect: true + +// Indexes by byte, not code point. +// +// Bytes: 11111 +// 012345678901234 +// Chars: sø mé ஃ thî ng +IO.print("søméஃthîng"[0..3]) // expect: søm +IO.print("søméஃthîng"[3...10]) // expect: méஃt + +// Only includes sequences whose first byte is in the range. +IO.print("søméஃthîng"[2..6]) // expect: méஃ +IO.print("søméஃthîng"[2...6]) // expect: mé +IO.print("søméஃthîng"[2...7]) // expect: méஃ diff --git a/test/core/string/subscript_range_from_not_int.wren b/test/core/string/subscript_range_from_not_int.wren index f397f5c9..1c46f0fe 100644 --- a/test/core/string/subscript_range_from_not_int.wren +++ b/test/core/string/subscript_range_from_not_int.wren @@ -1,3 +1,2 @@ -// skip: Range subscripts for strings don't handle UTF-8. var a = "string" a[1.5..2] // expect runtime error: Range start must be an integer. diff --git a/test/core/string/subscript_range_from_too_large.wren b/test/core/string/subscript_range_from_too_large.wren index e91bed09..e0240d6a 100644 --- a/test/core/string/subscript_range_from_too_large.wren +++ b/test/core/string/subscript_range_from_too_large.wren @@ -1,3 +1,2 @@ -// skip: Range subscripts for strings don't handle UTF-8. var a = "123" a[3..2] // expect runtime error: Range start out of bounds. diff --git a/test/core/string/subscript_range_from_too_small.wren b/test/core/string/subscript_range_from_too_small.wren index b0460ef1..3daec49c 100644 --- a/test/core/string/subscript_range_from_too_small.wren +++ b/test/core/string/subscript_range_from_too_small.wren @@ -1,3 +1,2 @@ -// skip: Range subscripts for strings don't handle UTF-8. var a = "123" a[-4..2] // expect runtime error: Range start out of bounds. diff --git a/test/core/string/subscript_range_to_exclusive_too_large.wren b/test/core/string/subscript_range_to_exclusive_too_large.wren index cc5307cf..3f34520b 100644 --- a/test/core/string/subscript_range_to_exclusive_too_large.wren +++ b/test/core/string/subscript_range_to_exclusive_too_large.wren @@ -1,3 +1,2 @@ -// skip: Range subscripts for strings don't handle UTF-8. var a = "123" a[1...4] // expect runtime error: Range end out of bounds. diff --git a/test/core/string/subscript_range_to_exclusive_too_small.wren b/test/core/string/subscript_range_to_exclusive_too_small.wren index 747a7313..2217e817 100644 --- a/test/core/string/subscript_range_to_exclusive_too_small.wren +++ b/test/core/string/subscript_range_to_exclusive_too_small.wren @@ -1,3 +1,2 @@ -// skip: Range subscripts for strings don't handle UTF-8. var a = "123" a[0...-5] // expect runtime error: Range end out of bounds. diff --git a/test/core/string/subscript_range_to_not_int.wren b/test/core/string/subscript_range_to_not_int.wren index ca2d12f4..059b2907 100644 --- a/test/core/string/subscript_range_to_not_int.wren +++ b/test/core/string/subscript_range_to_not_int.wren @@ -1,3 +1,2 @@ -// skip: Range subscripts for strings don't handle UTF-8. var a = "string" a[1..2.5] // expect runtime error: Range end must be an integer. diff --git a/test/core/string/subscript_range_to_too_large.wren b/test/core/string/subscript_range_to_too_large.wren index 701e3995..d0cbaaca 100644 --- a/test/core/string/subscript_range_to_too_large.wren +++ b/test/core/string/subscript_range_to_too_large.wren @@ -1,3 +1,2 @@ -// skip: Range subscripts for strings don't handle UTF-8. var a = "123" a[1..3] // expect runtime error: Range end out of bounds. diff --git a/test/core/string/subscript_range_to_too_small.wren b/test/core/string/subscript_range_to_too_small.wren index 9dcc5240..71f241a4 100644 --- a/test/core/string/subscript_range_to_too_small.wren +++ b/test/core/string/subscript_range_to_too_small.wren @@ -1,3 +1,2 @@ -// skip: Range subscripts for strings don't handle UTF-8. var a = "123" a[0..-4] // expect runtime error: Range end out of bounds.