mirror of
https://github.com/wren-lang/wren.git
synced 2026-01-11 06:08:41 +01:00
Get ranges working in string subscripts (again).
Now with UTF-8 hotness!
This commit is contained in:
@ -697,7 +697,7 @@ static void readUnicodeEscape(Parser* parser, ByteBuffer* string)
|
||||
int value = readHexEscape(parser, 4, "Unicode");
|
||||
|
||||
// Grow the buffer enough for the encoded result.
|
||||
int numBytes = wrenUtf8NumBytes(value);
|
||||
int numBytes = wrenUtf8EncodeNumBytes(value);
|
||||
if (numBytes != 0)
|
||||
{
|
||||
wrenByteBufferFill(parser->vm, string, 0, numBytes);
|
||||
|
||||
@ -604,7 +604,7 @@ DEF_PRIMITIVE(list_subscript)
|
||||
ObjList* result = wrenNewList(vm, count);
|
||||
for (uint32_t i = 0; i < count; i++)
|
||||
{
|
||||
result->elements.data[i] = list->elements.data[start + (i * step)];
|
||||
result->elements.data[i] = list->elements.data[start + i * step];
|
||||
}
|
||||
|
||||
RETURN_OBJ(result);
|
||||
@ -1229,23 +1229,12 @@ DEF_PRIMITIVE(string_subscript)
|
||||
RETURN_ERROR("Subscript must be a number or a range.");
|
||||
}
|
||||
|
||||
// TODO: Handle UTF-8 here.
|
||||
/*
|
||||
int step;
|
||||
int count = string->length;
|
||||
uint32_t count = string->length;
|
||||
int start = calculateRange(vm, args, AS_RANGE(args[1]), &count, &step);
|
||||
if (start == -1) return PRIM_ERROR;
|
||||
|
||||
ObjString* result = wrenNewUninitializedString(vm, count);
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
result->value[i] = string->value[start + (i * step)];
|
||||
}
|
||||
result->value[count] = '\0';
|
||||
|
||||
RETURN_OBJ(result);
|
||||
*/
|
||||
RETURN_ERROR("Subscript ranges for strings are not implemented yet.");
|
||||
RETURN_VAL(wrenNewStringFromRange(vm, string, start, count, step));
|
||||
}
|
||||
|
||||
DEF_PRIMITIVE(string_toString)
|
||||
|
||||
@ -59,7 +59,7 @@ int wrenSymbolTableFind(SymbolTable* symbols, const char* name, size_t length)
|
||||
return -1;
|
||||
}
|
||||
|
||||
int wrenUtf8NumBytes(int value)
|
||||
int wrenUtf8EncodeNumBytes(int value)
|
||||
{
|
||||
ASSERT(value >= 0, "Cannot encode a negative value.");
|
||||
|
||||
@ -70,12 +70,13 @@ int wrenUtf8NumBytes(int value)
|
||||
return 0;
|
||||
}
|
||||
|
||||
void wrenUtf8Encode(int value, uint8_t* bytes)
|
||||
int wrenUtf8Encode(int value, uint8_t* bytes)
|
||||
{
|
||||
if (value <= 0x7f)
|
||||
{
|
||||
// Single byte (i.e. fits in ASCII).
|
||||
*bytes = value & 0x7f;
|
||||
return 1;
|
||||
}
|
||||
else if (value <= 0x7ff)
|
||||
{
|
||||
@ -83,6 +84,7 @@ void wrenUtf8Encode(int value, uint8_t* bytes)
|
||||
*bytes = 0xc0 | ((value & 0x7c0) >> 6);
|
||||
bytes++;
|
||||
*bytes = 0x80 | (value & 0x3f);
|
||||
return 2;
|
||||
}
|
||||
else if (value <= 0xffff)
|
||||
{
|
||||
@ -92,6 +94,7 @@ void wrenUtf8Encode(int value, uint8_t* bytes)
|
||||
*bytes = 0x80 | ((value & 0xfc0) >> 6);
|
||||
bytes++;
|
||||
*bytes = 0x80 | (value & 0x3f);
|
||||
return 3;
|
||||
}
|
||||
else if (value <= 0x10ffff)
|
||||
{
|
||||
@ -103,12 +106,11 @@ void wrenUtf8Encode(int value, uint8_t* bytes)
|
||||
*bytes = 0x80 | ((value & 0xfc0) >> 6);
|
||||
bytes++;
|
||||
*bytes = 0x80 | (value & 0x3f);
|
||||
return 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Invalid Unicode value. See: http://tools.ietf.org/html/rfc3629
|
||||
ASSERT(false, "Invalid UTF-8 value.");
|
||||
}
|
||||
|
||||
// Invalid Unicode value. See: http://tools.ietf.org/html/rfc3629
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
int wrenUtf8Decode(const uint8_t* bytes, uint32_t length)
|
||||
@ -158,3 +160,19 @@ int wrenUtf8Decode(const uint8_t* bytes, uint32_t length)
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
// Returns the number of bytes in the UTF-8 sequence that starts at byte
// [index] of [string], judged only from that first byte.
//
// Returns 0 when the byte is a continuation byte (10xxxxxx), i.e. it sits in
// the middle of a sequence rather than at the start of one. Any byte that
// matches none of the multi-byte lead patterns is reported as 1.
int wrenUtf8DecodeNumBytes(const char* string, uint32_t index)
{
  // Read the byte as unsigned: plain char may be signed, and the masks below
  // are meant to test raw bit patterns rather than a possibly negative value.
  uint8_t first = (uint8_t)string[index];

  // A continuation byte does not begin a sequence, so don't count it at all.
  if ((first & 0xc0) == 0x80) return 0;

  // The high bits of a lead byte give the length of the whole sequence.
  if ((first & 0xf8) == 0xf0) return 4;  // 11110xxx
  if ((first & 0xf0) == 0xe0) return 3;  // 1110xxxx
  if ((first & 0xe0) == 0xc0) return 2;  // 110xxxxx
  return 1;
}
|
||||
|
||||
@ -100,14 +100,25 @@ int wrenSymbolTableFind(SymbolTable* symbols, const char* name, size_t length);
|
||||
// Returns the number of bytes needed to encode [value] in UTF-8.
|
||||
//
|
||||
// Returns 0 if [value] is too large to encode.
|
||||
int wrenUtf8NumBytes(int value);
|
||||
int wrenUtf8EncodeNumBytes(int value);
|
||||
|
||||
// Encodes value as a series of bytes in [bytes], which is assumed to be large
|
||||
// enough to hold the encoded result.
|
||||
void wrenUtf8Encode(int value, uint8_t* bytes);
|
||||
//
|
||||
// Returns the number of written bytes.
|
||||
int wrenUtf8Encode(int value, uint8_t* bytes);
|
||||
|
||||
// Decodes the UTF-8 sequence in [bytes] (which has max [length]), returning
|
||||
// the code point.
|
||||
// Decodes the UTF-8 sequence starting at [bytes] (which has max [length]),
|
||||
// returning the code point.
|
||||
//
|
||||
// Returns -1 if the bytes are not a valid UTF-8 sequence.
|
||||
int wrenUtf8Decode(const uint8_t* bytes, uint32_t length);
|
||||
|
||||
// Returns the number of bytes in the UTF-8 sequence starting at [index] in
|
||||
// [string].
|
||||
//
|
||||
// If the character at that index is not the beginning of a UTF-8 sequence,
|
||||
// returns 0.
|
||||
int wrenUtf8DecodeNumBytes(const char* string, uint32_t index);
|
||||
|
||||
#endif
|
||||
|
||||
@ -625,13 +625,41 @@ Value wrenNewString(WrenVM* vm, const char* text, size_t length)
|
||||
ObjString* string = allocateString(vm, length);
|
||||
|
||||
// Copy the string (if given one).
|
||||
if (length > 0) memcpy(string->value, text, length);
|
||||
if (length > 0 && text != NULL) memcpy(string->value, text, length);
|
||||
|
||||
hashString(string);
|
||||
|
||||
return OBJ_VAL(string);
|
||||
}
|
||||
|
||||
// Creates a new string holding the code points of [source] selected by a byte
// range.
//
// The range visits [count] byte offsets, beginning at [start] and moving by
// [step] bytes each time. An offset contributes to the result only when
// wrenUtf8Decode() reports a code point for the bytes beginning there; offsets
// for which it returns -1 are skipped.
//
// Returns the new string wrapped in a Value.
Value wrenNewStringFromRange(WrenVM* vm, ObjString* source, int start,
                             uint32_t count, int step)
{
  uint8_t* from = (uint8_t*)source->value;

  // First pass: measure the result. Every offset is decoded exactly the way
  // the copy loop below decodes it, so the allocation always matches what gets
  // written. Sizing from the raw byte with wrenUtf8EncodeNumBytes() treats the
  // byte as a code point: the lead byte of a 3- or 4-byte sequence is counted
  // as 2, which lets the copy loop write past the buffer, and continuation
  // bytes are counted even though the copy loop skips them.
  int length = 0;
  for (uint32_t i = 0; i < count; i++)
  {
    int index = start + i * step;
    int codePoint = wrenUtf8Decode(from + index, source->length - index);
    if (codePoint == -1) continue;

    // wrenUtf8EncodeNumBytes() yields 0 for values too large to encode, so
    // those add nothing here and are skipped when copying.
    length += wrenUtf8EncodeNumBytes(codePoint);
  }

  ObjString* result = allocateString(vm, length);
  result->value[length] = '\0';

  // Second pass: re-encode each selected code point into the result.
  uint8_t* to = (uint8_t*)result->value;
  for (uint32_t i = 0; i < count; i++)
  {
    int index = start + i * step;
    int codePoint = wrenUtf8Decode(from + index, source->length - index);

    // Write only what the first pass counted.
    if (codePoint != -1 && wrenUtf8EncodeNumBytes(codePoint) != 0)
    {
      to += wrenUtf8Encode(codePoint, to);
    }
  }

  hashString(result);
  return OBJ_VAL(result);
}
|
||||
|
||||
Value wrenNumToString(WrenVM* vm, double value)
|
||||
{
|
||||
// Corner case: If the value is NaN, different versions of libc produce
|
||||
@ -664,7 +692,7 @@ Value wrenNumToString(WrenVM* vm, double value)
|
||||
|
||||
Value wrenStringFromCodePoint(WrenVM* vm, int value)
|
||||
{
|
||||
int length = wrenUtf8NumBytes(value);
|
||||
int length = wrenUtf8EncodeNumBytes(value);
|
||||
ASSERT(length != 0, "Value out of range.");
|
||||
|
||||
ObjString* string = allocateString(vm, length);
|
||||
@ -743,19 +771,8 @@ Value wrenStringFormat(WrenVM* vm, const char* format, ...)
|
||||
Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, uint32_t index)
|
||||
{
|
||||
ASSERT(index < string->length, "Index out of bounds.");
|
||||
|
||||
char first = string->value[index];
|
||||
|
||||
// The first byte's high bits tell us how many bytes are in the UTF-8
|
||||
// sequence. If the byte starts with 10xxxxx, it's the middle of a UTF-8
|
||||
// sequence, so return an empty string.
|
||||
int numBytes;
|
||||
if ((first & 0xc0) == 0x80) numBytes = 0;
|
||||
else if ((first & 0xf8) == 0xf0) numBytes = 4;
|
||||
else if ((first & 0xf0) == 0xe0) numBytes = 3;
|
||||
else if ((first & 0xe0) == 0xc0) numBytes = 2;
|
||||
else numBytes = 1;
|
||||
|
||||
|
||||
int numBytes = wrenUtf8DecodeNumBytes(string->value, index);
|
||||
return wrenNewString(vm, string->value + index, numBytes);
|
||||
}
|
||||
|
||||
|
||||
@ -715,6 +715,12 @@ Value wrenNewRange(WrenVM* vm, double from, double to, bool isInclusive);
|
||||
// [text] may be NULL if [length] is zero.
|
||||
Value wrenNewString(WrenVM* vm, const char* text, size_t length);
|
||||
|
||||
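// Creates a new string object from a range of bytes in [source].
//
// The range starts at byte index [start], visits [count] byte positions, and
// advances by [step] bytes between positions. A UTF-8 sequence is included
// only when its first byte is one of the visited positions; positions that do
// not begin a valid sequence contribute nothing to the result.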
|
||||
Value wrenNewStringFromRange(WrenVM* vm, ObjString* source, int start,
|
||||
uint32_t count, int step);
|
||||
|
||||
// Produces a string representation of [value].
|
||||
Value wrenNumToString(WrenVM* vm, double value);
|
||||
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var string = "abcde"
|
||||
IO.print(string[0..0]) // expect: a
|
||||
IO.print(string[1...1] == "") // expect: true
|
||||
@ -33,3 +32,16 @@ IO.print(string[3...-6]) // expect: dcba
|
||||
// An empty range at zero is allowed on an empty string.
|
||||
IO.print(""[0...0] == "") // expect: true
|
||||
IO.print(""[0..-1] == "") // expect: true
|
||||
|
||||
// Indexes by byte, not code point.
|
||||
//
|
||||
// Bytes: 11111
|
||||
// 012345678901234
|
||||
// Chars: sø mé ஃ thî ng
|
||||
IO.print("søméஃthîng"[0..3]) // expect: søm
|
||||
IO.print("søméஃthîng"[3...10]) // expect: méஃt
|
||||
|
||||
// Only includes sequences whose first byte is in the range.
|
||||
IO.print("søméஃthîng"[2..6]) // expect: méஃ
|
||||
IO.print("søméஃthîng"[2...6]) // expect: mé
|
||||
IO.print("søméஃthîng"[2...7]) // expect: méஃ
|
||||
|
||||
@ -1,3 +1,2 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var a = "string"
|
||||
a[1.5..2] // expect runtime error: Range start must be an integer.
|
||||
|
||||
@ -1,3 +1,2 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var a = "123"
|
||||
a[3..2] // expect runtime error: Range start out of bounds.
|
||||
|
||||
@ -1,3 +1,2 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var a = "123"
|
||||
a[-4..2] // expect runtime error: Range start out of bounds.
|
||||
|
||||
@ -1,3 +1,2 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var a = "123"
|
||||
a[1...4] // expect runtime error: Range end out of bounds.
|
||||
|
||||
@ -1,3 +1,2 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var a = "123"
|
||||
a[0...-5] // expect runtime error: Range end out of bounds.
|
||||
|
||||
@ -1,3 +1,2 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var a = "string"
|
||||
a[1..2.5] // expect runtime error: Range end must be an integer.
|
||||
|
||||
@ -1,3 +1,2 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var a = "123"
|
||||
a[1..3] // expect runtime error: Range end out of bounds.
|
||||
|
||||
@ -1,3 +1,2 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var a = "123"
|
||||
a[0..-4] // expect runtime error: Range end out of bounds.
|
||||
|
||||
Reference in New Issue
Block a user