Get ranges working in string subscripts (again).

Now with UTF-8 hotness!
2026-01-11 06:08:41 +01:00 · 2015-09-01 22:14:55 -07:00
parent 2e83f056c1
commit 783a5b750a
15 changed files with 96 additions and 51 deletions
--- a/src/vm/wren_compiler.c
+++ b/src/vm/wren_compiler.c
@ -697,7 +697,7 @@ static void readUnicodeEscape(Parser* parser, ByteBuffer* string)
  int value = readHexEscape(parser, 4, "Unicode");

  // Grow the buffer enough for the encoded result.
-  int numBytes = wrenUtf8NumBytes(value);
+  int numBytes = wrenUtf8EncodeNumBytes(value);
  if (numBytes != 0)
  {
    wrenByteBufferFill(parser->vm, string, 0, numBytes);
--- a/src/vm/wren_core.c
+++ b/src/vm/wren_core.c
@ -604,7 +604,7 @@ DEF_PRIMITIVE(list_subscript)
  ObjList* result = wrenNewList(vm, count);
  for (uint32_t i = 0; i < count; i++)
  {
-    result->elements.data[i] = list->elements.data[start + (i * step)];
+    result->elements.data[i] = list->elements.data[start + i * step];
  }

  RETURN_OBJ(result);
@ -1229,23 +1229,12 @@ DEF_PRIMITIVE(string_subscript)
    RETURN_ERROR("Subscript must be a number or a range.");
  }

-  // TODO: Handle UTF-8 here.
-  /*
  int step;
-  int count = string->length;
+  uint32_t count = string->length;
  int start = calculateRange(vm, args, AS_RANGE(args[1]), &count, &step);
  if (start == -1) return PRIM_ERROR;

-  ObjString* result = wrenNewUninitializedString(vm, count);
-  for (int i = 0; i < count; i++)
-  {
-    result->value[i] = string->value[start + (i * step)];
-  }
-  result->value[count] = '\0';
-
-  RETURN_OBJ(result);
-  */
-  RETURN_ERROR("Subscript ranges for strings are not implemented yet.");
+  RETURN_VAL(wrenNewStringFromRange(vm, string, start, count, step));
 }

 DEF_PRIMITIVE(string_toString)
--- a/src/vm/wren_utils.c
+++ b/src/vm/wren_utils.c
@ -59,7 +59,7 @@ int wrenSymbolTableFind(SymbolTable* symbols, const char* name, size_t length)
  return -1;
 }

-int wrenUtf8NumBytes(int value)
+int wrenUtf8EncodeNumBytes(int value)
 {
  ASSERT(value >= 0, "Cannot encode a negative value.");
  
@ -70,12 +70,13 @@ int wrenUtf8NumBytes(int value)
  return 0;
 }

-void wrenUtf8Encode(int value, uint8_t* bytes)
+int wrenUtf8Encode(int value, uint8_t* bytes)
 {
  if (value <= 0x7f)
  {
    // Single byte (i.e. fits in ASCII).
    *bytes = value & 0x7f;
+    return 1;
  }
  else if (value <= 0x7ff)
  {
@ -83,6 +84,7 @@ void wrenUtf8Encode(int value, uint8_t* bytes)
    *bytes = 0xc0 | ((value & 0x7c0) >> 6);
    bytes++;
    *bytes = 0x80 | (value & 0x3f);
+    return 2;
  }
  else if (value <= 0xffff)
  {
@ -92,6 +94,7 @@ void wrenUtf8Encode(int value, uint8_t* bytes)
    *bytes = 0x80 | ((value & 0xfc0) >> 6);
    bytes++;
    *bytes = 0x80 | (value & 0x3f);
+    return 3;
  }
  else if (value <= 0x10ffff)
  {
@ -103,12 +106,11 @@ void wrenUtf8Encode(int value, uint8_t* bytes)
    *bytes = 0x80 | ((value & 0xfc0) >> 6);
    bytes++;
    *bytes = 0x80 | (value & 0x3f);
+    return 4;
  }
-  else
-  {
-    // Invalid Unicode value. See: http://tools.ietf.org/html/rfc3629
-    ASSERT(false, "Invalid UTF-8 value.");
-  }
+
+  // Invalid Unicode value. See: http://tools.ietf.org/html/rfc3629
+  UNREACHABLE();
 }

 int wrenUtf8Decode(const uint8_t* bytes, uint32_t length)
@ -158,3 +160,19 @@ int wrenUtf8Decode(const uint8_t* bytes, uint32_t length)

  return value;
 }
+
+int wrenUtf8DecodeNumBytes(const char* string, uint32_t index)
+{
+  // Only the byte at [index] is inspected; it is viewed as unsigned so the
+  // range comparisons below behave the same whatever the signedness of char.
+  uint8_t lead = (uint8_t)string[index];
+
+  // 0xxxxxxx: a single-byte (ASCII) sequence.
+  if (lead < 0x80) return 1;
+
+  // 10xxxxxx: the middle of a UTF-8 sequence, so don't count it at all.
+  if (lead < 0xc0) return 0;
+
+  // 110xxxxx, 1110xxxx and 11110xxx begin sequences of 2, 3 and 4 bytes.
+  if (lead < 0xe0) return 2;
+  if (lead < 0xf0) return 3;
+  if (lead < 0xf8) return 4;
+
+  // 11111xxx matches none of the patterns above; count it as one byte.
+  return 1;
+}
--- a/src/vm/wren_utils.h
+++ b/src/vm/wren_utils.h
@ -100,14 +100,25 @@ int wrenSymbolTableFind(SymbolTable* symbols, const char* name, size_t length);
 // Returns the number of bytes needed to encode [value] in UTF-8.
 //
 // Returns 0 if [value] is too large to encode.
-int wrenUtf8NumBytes(int value);
+int wrenUtf8EncodeNumBytes(int value);

 // Encodes value as a series of bytes in [bytes], which is assumed to be large
 // enough to hold the encoded result.
-void wrenUtf8Encode(int value, uint8_t* bytes);
+//
+// Returns the number of written bytes.
+int wrenUtf8Encode(int value, uint8_t* bytes);

-// Decodes the UTF-8 sequence in [bytes] (which has max [length]), returning
-// the code point.
+// Decodes the UTF-8 sequence starting at [bytes] (which has max [length]),
+// returning the code point.
+//
+// Returns -1 if the bytes are not a valid UTF-8 sequence.
 int wrenUtf8Decode(const uint8_t* bytes, uint32_t length);

+// Returns the number of bytes in the UTF-8 sequence starting at [index] in
+// [string].
+//
+// If the byte at that index is a continuation byte (the middle of a UTF-8
+// sequence) rather than the beginning of one, returns 0.
+int wrenUtf8DecodeNumBytes(const char* string, uint32_t index);
+
 #endif
--- a/src/vm/wren_value.c
+++ b/src/vm/wren_value.c
@ -625,13 +625,41 @@ Value wrenNewString(WrenVM* vm, const char* text, size_t length)
  ObjString* string = allocateString(vm, length);

  // Copy the string (if given one).
-  if (length > 0) memcpy(string->value, text, length);
+  if (length > 0 && text != NULL) memcpy(string->value, text, length);

  hashString(string);
-
  return OBJ_VAL(string);
 }

+// Builds a new string from the [count] byte offsets `start + i * step` of
+// [source]. Each offset is decoded as the start of a UTF-8 sequence and the
+// resulting code point is re-encoded into the result; offsets for which
+// wrenUtf8Decode() returns -1 contribute nothing.
+Value wrenNewStringFromRange(WrenVM* vm, ObjString* source, int start,
+                             uint32_t count, int step)
+{
+  uint8_t* from = (uint8_t*)source->value;
+
+  // First pass: measure the result. Each offset is decoded exactly the way
+  // the copy loop below decodes it, so the allocated length always equals the
+  // number of bytes written. Sizing from the raw byte value (as though the
+  // byte were itself a code point) miscounts every byte >= 0x80.
+  int length = 0;
+  for (uint32_t i = 0; i < count; i++)
+  {
+    int index = start + i * step;
+    int codePoint = wrenUtf8Decode(from + index, source->length - index);
+
+    // wrenUtf8EncodeNumBytes() yields 0 for values too large to encode, which
+    // the copy loop skips as well.
+    if (codePoint != -1) length += wrenUtf8EncodeNumBytes(codePoint);
+  }
+
+  ObjString* result = allocateString(vm, length);
+  result->value[length] = '\0';
+
+  // Second pass: re-encode every code point that was counted above.
+  uint8_t* to = (uint8_t*)result->value;
+  for (uint32_t i = 0; i < count; i++)
+  {
+    int index = start + i * step;
+    int codePoint = wrenUtf8Decode(from + index, source->length - index);
+
+    // Never hand wrenUtf8Encode() a value it cannot encode.
+    if (codePoint != -1 && wrenUtf8EncodeNumBytes(codePoint) != 0)
+    {
+      to += wrenUtf8Encode(codePoint, to);
+    }
+  }
+
+  hashString(result);
+  return OBJ_VAL(result);
+}
+
 Value wrenNumToString(WrenVM* vm, double value)
 {
  // Corner case: If the value is NaN, different versions of libc produce
@ -664,7 +692,7 @@ Value wrenNumToString(WrenVM* vm, double value)

 Value wrenStringFromCodePoint(WrenVM* vm, int value)
 {
-  int length = wrenUtf8NumBytes(value);
+  int length = wrenUtf8EncodeNumBytes(value);
  ASSERT(length != 0, "Value out of range.");

  ObjString* string = allocateString(vm, length);
@ -743,19 +771,8 @@ Value wrenStringFormat(WrenVM* vm, const char* format, ...)
 Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, uint32_t index)
 {
  ASSERT(index < string->length, "Index out of bounds.");
-
-  char first = string->value[index];
-
-  // The first byte's high bits tell us how many bytes are in the UTF-8
-  // sequence. If the byte starts with 10xxxxx, it's the middle of a UTF-8
-  // sequence, so return an empty string.
-  int numBytes;
-  if      ((first & 0xc0) == 0x80) numBytes = 0;
-  else if ((first & 0xf8) == 0xf0) numBytes = 4;
-  else if ((first & 0xf0) == 0xe0) numBytes = 3;
-  else if ((first & 0xe0) == 0xc0) numBytes = 2;
-  else numBytes = 1;
-
+
+  // Zero when [index] lands on a continuation byte, which produces an empty
+  // string below.
+  int numBytes = wrenUtf8DecodeNumBytes(string->value, index);
+
+  // The count comes from the lead byte alone, so a sequence cut short by the
+  // end of the string can announce more bytes than remain. Clamp it so the
+  // copy never reads beyond the string's own bytes.
+  uint32_t remaining = string->length - index;
+  if ((uint32_t)numBytes > remaining) numBytes = (int)remaining;
+
  return wrenNewString(vm, string->value + index, numBytes);
 }

--- a/src/vm/wren_value.h
+++ b/src/vm/wren_value.h
@ -715,6 +715,12 @@ Value wrenNewRange(WrenVM* vm, double from, double to, bool isInclusive);
 // [text] may be NULL if [length] is zero.
 Value wrenNewString(WrenVM* vm, const char* text, size_t length);

+// Creates a new string object by taking a range of characters from [source].
+// The range starts at byte offset [start], visits [count] byte offsets, and
+// moves by [step] bytes from one offset to the next.
+Value wrenNewStringFromRange(WrenVM* vm, ObjString* source, int start,
+                             uint32_t count, int step);
+
 // Produces a string representation of [value].
 Value wrenNumToString(WrenVM* vm, double value);

--- a/test/core/string/subscript_range.wren
+++ b/test/core/string/subscript_range.wren
@ -1,4 +1,3 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var string = "abcde"
 IO.print(string[0..0]) // expect: a
 IO.print(string[1...1] == "") // expect: true
@ -33,3 +32,16 @@ IO.print(string[3...-6]) // expect: dcba
 // An empty range at zero is allowed on an empty string.
 IO.print(""[0...0] == "") // expect: true
 IO.print(""[0..-1] == "") // expect: true
+
+// Indexes by byte, not code point.
+//
+// Bytes:           11111
+//        012345678901234
+// Chars: sø mé ஃ  thî ng
+IO.print("søméஃthîng"[0..3]) // expect: søm
+IO.print("søméஃthîng"[3...10]) // expect: méஃt
+
+// Only includes sequences whose first byte is in the range.
+IO.print("søméஃthîng"[2..6]) // expect: méஃ
+IO.print("søméஃthîng"[2...6]) // expect: mé
+IO.print("søméஃthîng"[2...7]) // expect: méஃ
--- a/test/core/string/subscript_range_from_not_int.wren
+++ b/test/core/string/subscript_range_from_not_int.wren
@ -1,3 +1,2 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var a = "string"
 a[1.5..2] // expect runtime error: Range start must be an integer.
--- a/test/core/string/subscript_range_from_too_large.wren
+++ b/test/core/string/subscript_range_from_too_large.wren
@ -1,3 +1,2 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var a = "123"
 a[3..2] // expect runtime error: Range start out of bounds.
--- a/test/core/string/subscript_range_from_too_small.wren
+++ b/test/core/string/subscript_range_from_too_small.wren
@ -1,3 +1,2 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var a = "123"
 a[-4..2] // expect runtime error: Range start out of bounds.
--- a/test/core/string/subscript_range_to_exclusive_too_large.wren
+++ b/test/core/string/subscript_range_to_exclusive_too_large.wren
@ -1,3 +1,2 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var a = "123"
 a[1...4] // expect runtime error: Range end out of bounds.
--- a/test/core/string/subscript_range_to_exclusive_too_small.wren
+++ b/test/core/string/subscript_range_to_exclusive_too_small.wren
@ -1,3 +1,2 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var a = "123"
 a[0...-5] // expect runtime error: Range end out of bounds.
--- a/test/core/string/subscript_range_to_not_int.wren
+++ b/test/core/string/subscript_range_to_not_int.wren
@ -1,3 +1,2 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var a = "string"
 a[1..2.5] // expect runtime error: Range end must be an integer.
--- a/test/core/string/subscript_range_to_too_large.wren
+++ b/test/core/string/subscript_range_to_too_large.wren
@ -1,3 +1,2 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var a = "123"
 a[1..3] // expect runtime error: Range end out of bounds.
--- a/test/core/string/subscript_range_to_too_small.wren
+++ b/test/core/string/subscript_range_to_too_small.wren
@ -1,3 +1,2 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var a = "123"
 a[0..-4] // expect runtime error: Range end out of bounds.