Get ranges working in string subscripts (again).

Now with UTF-8 hotness!
This commit is contained in:
Bob Nystrom
2015-09-01 22:14:55 -07:00
parent 2e83f056c1
commit 783a5b750a
15 changed files with 96 additions and 51 deletions

View File

@ -697,7 +697,7 @@ static void readUnicodeEscape(Parser* parser, ByteBuffer* string)
int value = readHexEscape(parser, 4, "Unicode");
// Grow the buffer enough for the encoded result.
int numBytes = wrenUtf8NumBytes(value);
int numBytes = wrenUtf8EncodeNumBytes(value);
if (numBytes != 0)
{
wrenByteBufferFill(parser->vm, string, 0, numBytes);

View File

@ -604,7 +604,7 @@ DEF_PRIMITIVE(list_subscript)
ObjList* result = wrenNewList(vm, count);
for (uint32_t i = 0; i < count; i++)
{
result->elements.data[i] = list->elements.data[start + (i * step)];
result->elements.data[i] = list->elements.data[start + i * step];
}
RETURN_OBJ(result);
@ -1229,23 +1229,12 @@ DEF_PRIMITIVE(string_subscript)
RETURN_ERROR("Subscript must be a number or a range.");
}
// TODO: Handle UTF-8 here.
/*
int step;
int count = string->length;
uint32_t count = string->length;
int start = calculateRange(vm, args, AS_RANGE(args[1]), &count, &step);
if (start == -1) return PRIM_ERROR;
ObjString* result = wrenNewUninitializedString(vm, count);
for (int i = 0; i < count; i++)
{
result->value[i] = string->value[start + (i * step)];
}
result->value[count] = '\0';
RETURN_OBJ(result);
*/
RETURN_ERROR("Subscript ranges for strings are not implemented yet.");
RETURN_VAL(wrenNewStringFromRange(vm, string, start, count, step));
}
DEF_PRIMITIVE(string_toString)

View File

@ -59,7 +59,7 @@ int wrenSymbolTableFind(SymbolTable* symbols, const char* name, size_t length)
return -1;
}
int wrenUtf8NumBytes(int value)
int wrenUtf8EncodeNumBytes(int value)
{
ASSERT(value >= 0, "Cannot encode a negative value.");
@ -70,12 +70,13 @@ int wrenUtf8NumBytes(int value)
return 0;
}
void wrenUtf8Encode(int value, uint8_t* bytes)
int wrenUtf8Encode(int value, uint8_t* bytes)
{
if (value <= 0x7f)
{
// Single byte (i.e. fits in ASCII).
*bytes = value & 0x7f;
return 1;
}
else if (value <= 0x7ff)
{
@ -83,6 +84,7 @@ void wrenUtf8Encode(int value, uint8_t* bytes)
*bytes = 0xc0 | ((value & 0x7c0) >> 6);
bytes++;
*bytes = 0x80 | (value & 0x3f);
return 2;
}
else if (value <= 0xffff)
{
@ -92,6 +94,7 @@ void wrenUtf8Encode(int value, uint8_t* bytes)
*bytes = 0x80 | ((value & 0xfc0) >> 6);
bytes++;
*bytes = 0x80 | (value & 0x3f);
return 3;
}
else if (value <= 0x10ffff)
{
@ -103,12 +106,11 @@ void wrenUtf8Encode(int value, uint8_t* bytes)
*bytes = 0x80 | ((value & 0xfc0) >> 6);
bytes++;
*bytes = 0x80 | (value & 0x3f);
return 4;
}
else
{
// Invalid Unicode value. See: http://tools.ietf.org/html/rfc3629
ASSERT(false, "Invalid UTF-8 value.");
}
UNREACHABLE();
}
int wrenUtf8Decode(const uint8_t* bytes, uint32_t length)
@ -158,3 +160,19 @@ int wrenUtf8Decode(const uint8_t* bytes, uint32_t length)
return value;
}
// Reports how many bytes make up the UTF-8 sequence that begins at byte
// offset [index] of [string], judged solely from that one byte.
//
// Yields 0 when the byte is a continuation byte (10xxxxxx), i.e. the offset
// lands in the middle of a sequence rather than at the start of one.
int wrenUtf8DecodeNumBytes(const char* string, uint32_t index)
{
  const char lead = string[index];
  int numBytes;

  if ((lead & 0xc0) == 0x80)
  {
    // 10xxxxxx: not the start of a sequence, so it counts for nothing.
    numBytes = 0;
  }
  else if ((lead & 0xe0) == 0xc0)
  {
    // 110xxxxx: lead byte of a two-byte sequence.
    numBytes = 2;
  }
  else if ((lead & 0xf0) == 0xe0)
  {
    // 1110xxxx: lead byte of a three-byte sequence.
    numBytes = 3;
  }
  else if ((lead & 0xf8) == 0xf0)
  {
    // 11110xxx: lead byte of a four-byte sequence.
    numBytes = 4;
  }
  else
  {
    // Anything else is treated as a single byte.
    numBytes = 1;
  }

  return numBytes;
}

View File

@ -100,14 +100,25 @@ int wrenSymbolTableFind(SymbolTable* symbols, const char* name, size_t length);
// Returns the number of bytes needed to encode [value] in UTF-8.
//
// Returns 0 if [value] is too large to encode.
int wrenUtf8NumBytes(int value);
int wrenUtf8EncodeNumBytes(int value);
// Encodes value as a series of bytes in [bytes], which is assumed to be large
// enough to hold the encoded result.
void wrenUtf8Encode(int value, uint8_t* bytes);
//
// Returns the number of written bytes.
int wrenUtf8Encode(int value, uint8_t* bytes);
// Decodes the UTF-8 sequence in [bytes] (which has max [length]), returning
// the code point.
// Decodes the UTF-8 sequence starting at [bytes] (which has max [length]),
// returning the code point.
//
// Returns -1 if the bytes are not a valid UTF-8 sequence.
int wrenUtf8Decode(const uint8_t* bytes, uint32_t length);
// Returns the number of bytes in the UTF-8 sequence starting at [index] in
// [string].
//
// If the character at that index is not the beginning of a UTF-8 sequence,
// returns 0.
int wrenUtf8DecodeNumBytes(const char* string, uint32_t index);
#endif

View File

@ -625,13 +625,41 @@ Value wrenNewString(WrenVM* vm, const char* text, size_t length)
ObjString* string = allocateString(vm, length);
// Copy the string (if given one).
if (length > 0) memcpy(string->value, text, length);
if (length > 0 && text != NULL) memcpy(string->value, text, length);
hashString(string);
return OBJ_VAL(string);
}
// Builds a new string from [source] by visiting [count] byte offsets, starting
// at [start] and advancing by [step] each time.
//
// Every visited offset that begins a valid UTF-8 sequence contributes that
// whole code point to the result. Offsets where decoding fails (for example,
// ones that land in the middle of a multi-byte sequence) contribute nothing.
Value wrenNewStringFromRange(WrenVM* vm, ObjString* source, int start,
                             uint32_t count, int step)
{
  uint8_t* from = (uint8_t*)source->value;

  // First pass: measure the result. This must mirror the copy pass below
  // exactly: decode the sequence at each offset and count the bytes needed to
  // re-encode it. Sizing from the raw byte value instead (treating the byte as
  // if it were a code point) miscounts lead and continuation bytes, which can
  // under-allocate the buffer or leave uninitialized bytes in the string.
  int length = 0;
  for (uint32_t i = 0; i < count; i++)
  {
    int index = start + i * step;
    int codePoint = wrenUtf8Decode(from + index, source->length - index);
    if (codePoint != -1)
    {
      length += wrenUtf8EncodeNumBytes(codePoint);
    }
  }

  ObjString* result = allocateString(vm, length);
  result->value[length] = '\0';

  // Second pass: re-encode each decodable code point into the result.
  uint8_t* to = (uint8_t*)result->value;
  for (uint32_t i = 0; i < count; i++)
  {
    int index = start + i * step;
    int codePoint = wrenUtf8Decode(from + index, source->length - index);
    if (codePoint != -1)
    {
      to += wrenUtf8Encode(codePoint, to);
    }
  }

  hashString(result);
  return OBJ_VAL(result);
}
Value wrenNumToString(WrenVM* vm, double value)
{
// Corner case: If the value is NaN, different versions of libc produce
@ -664,7 +692,7 @@ Value wrenNumToString(WrenVM* vm, double value)
Value wrenStringFromCodePoint(WrenVM* vm, int value)
{
int length = wrenUtf8NumBytes(value);
int length = wrenUtf8EncodeNumBytes(value);
ASSERT(length != 0, "Value out of range.");
ObjString* string = allocateString(vm, length);
@ -744,18 +772,7 @@ Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, uint32_t index)
{
ASSERT(index < string->length, "Index out of bounds.");
char first = string->value[index];
// The first byte's high bits tell us how many bytes are in the UTF-8
// sequence. If the byte starts with 10xxxxx, it's the middle of a UTF-8
// sequence, so return an empty string.
int numBytes;
if ((first & 0xc0) == 0x80) numBytes = 0;
else if ((first & 0xf8) == 0xf0) numBytes = 4;
else if ((first & 0xf0) == 0xe0) numBytes = 3;
else if ((first & 0xe0) == 0xc0) numBytes = 2;
else numBytes = 1;
int numBytes = wrenUtf8DecodeNumBytes(string->value, index);
return wrenNewString(vm, string->value + index, numBytes);
}

View File

@ -715,6 +715,12 @@ Value wrenNewRange(WrenVM* vm, double from, double to, bool isInclusive);
// [text] may be NULL if [length] is zero.
Value wrenNewString(WrenVM* vm, const char* text, size_t length);
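// Creates a new string object by taking a range of characters from [source].
// The range starts at byte offset [start], visits [count] byte offsets, and
// advances by [step] bytes each time. Each visited offset that begins a valid
// UTF-8 sequence contributes that whole code point to the result.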
Value wrenNewStringFromRange(WrenVM* vm, ObjString* source, int start,
uint32_t count, int step);
// Produces a string representation of [value].
Value wrenNumToString(WrenVM* vm, double value);

View File

@ -1,4 +1,3 @@
// skip: Range subscripts for strings don't handle UTF-8.
var string = "abcde"
IO.print(string[0..0]) // expect: a
IO.print(string[1...1] == "") // expect: true
@ -33,3 +32,16 @@ IO.print(string[3...-6]) // expect: dcba
// An empty range at zero is allowed on an empty string.
IO.print(""[0...0] == "") // expect: true
IO.print(""[0..-1] == "") // expect: true
// Indexes by byte, not code point.
//
// Bytes: 11111
// 012345678901234
// Chars: sø mé ஃ thî ng
IO.print("søméஃthîng"[0..3]) // expect: søm
IO.print("søméஃthîng"[3...10]) // expect: méஃt
// Only includes sequences whose first byte is in the range.
IO.print("søméஃthîng"[2..6]) // expect: méஃ
IO.print("søméஃthîng"[2...6]) // expect: mé
IO.print("søméஃthîng"[2...7]) // expect: méஃ

View File

@ -1,3 +1,2 @@
// skip: Range subscripts for strings don't handle UTF-8.
var a = "string"
a[1.5..2] // expect runtime error: Range start must be an integer.

View File

@ -1,3 +1,2 @@
// skip: Range subscripts for strings don't handle UTF-8.
var a = "123"
a[3..2] // expect runtime error: Range start out of bounds.

View File

@ -1,3 +1,2 @@
// skip: Range subscripts for strings don't handle UTF-8.
var a = "123"
a[-4..2] // expect runtime error: Range start out of bounds.

View File

@ -1,3 +1,2 @@
// skip: Range subscripts for strings don't handle UTF-8.
var a = "123"
a[1...4] // expect runtime error: Range end out of bounds.

View File

@ -1,3 +1,2 @@
// skip: Range subscripts for strings don't handle UTF-8.
var a = "123"
a[0...-5] // expect runtime error: Range end out of bounds.

View File

@ -1,3 +1,2 @@
// skip: Range subscripts for strings don't handle UTF-8.
var a = "string"
a[1..2.5] // expect runtime error: Range end must be an integer.

View File

@ -1,3 +1,2 @@
// skip: Range subscripts for strings don't handle UTF-8.
var a = "123"
a[1..3] // expect runtime error: Range end out of bounds.

View File

@ -1,3 +1,2 @@
// skip: Range subscripts for strings don't handle UTF-8.
var a = "123"
a[0..-4] // expect runtime error: Range end out of bounds.