From 783a5b750aa610670114fe2f8f00682b6a50a0f0 Mon Sep 17 00:00:00 2001
From: Bob Nystrom <robert@stuffwithstuff.com>
Date: Tue, 1 Sep 2015 22:14:55 -0700
Subject: [PATCH] Get ranges working in string subscripts (again).

Now with UTF-8 hotness!
---
 src/vm/wren_compiler.c                        |  2 +-
 src/vm/wren_core.c                            | 17 ++-----
 src/vm/wren_utils.c                           | 32 +++++++++---
 src/vm/wren_utils.h                           | 19 +++++--
 src/vm/wren_value.c                           | 49 +++++++++++++------
 src/vm/wren_value.h                           |  6 +++
 test/core/string/subscript_range.wren         | 14 +++++-
 .../string/subscript_range_from_not_int.wren  |  1 -
 .../subscript_range_from_too_large.wren       |  1 -
 .../subscript_range_from_too_small.wren       |  1 -
 ...ubscript_range_to_exclusive_too_large.wren |  1 -
 ...ubscript_range_to_exclusive_too_small.wren |  1 -
 .../string/subscript_range_to_not_int.wren    |  1 -
 .../string/subscript_range_to_too_large.wren  |  1 -
 .../string/subscript_range_to_too_small.wren  |  1 -
 15 files changed, 96 insertions(+), 51 deletions(-)

diff --git a/src/vm/wren_compiler.c b/src/vm/wren_compiler.c
index 49c9bd94..d4165773 100644
--- a/src/vm/wren_compiler.c
+++ b/src/vm/wren_compiler.c
@@ -697,7 +697,7 @@ static void readUnicodeEscape(Parser* parser, ByteBuffer* string)
   int value = readHexEscape(parser, 4, "Unicode");
 
   // Grow the buffer enough for the encoded result.
-  int numBytes = wrenUtf8NumBytes(value);
+  int numBytes = wrenUtf8EncodeNumBytes(value);
   if (numBytes != 0)
   {
     wrenByteBufferFill(parser->vm, string, 0, numBytes);
diff --git a/src/vm/wren_core.c b/src/vm/wren_core.c
index fe1b930d..8484ae6c 100644
--- a/src/vm/wren_core.c
+++ b/src/vm/wren_core.c
@@ -604,7 +604,7 @@ DEF_PRIMITIVE(list_subscript)
   ObjList* result = wrenNewList(vm, count);
   for (uint32_t i = 0; i < count; i++)
   {
-    result->elements.data[i] = list->elements.data[start + (i * step)];
+    result->elements.data[i] = list->elements.data[start + i * step];
   }
 
   RETURN_OBJ(result);
@@ -1229,23 +1229,12 @@ DEF_PRIMITIVE(string_subscript)
     RETURN_ERROR("Subscript must be a number or a range.");
   }
 
-  // TODO: Handle UTF-8 here.
-  /*
   int step;
-  int count = string->length;
+  uint32_t count = string->length;
   int start = calculateRange(vm, args, AS_RANGE(args[1]), &count, &step);
   if (start == -1) return PRIM_ERROR;
 
-  ObjString* result = wrenNewUninitializedString(vm, count);
-  for (int i = 0; i < count; i++)
-  {
-    result->value[i] = string->value[start + (i * step)];
-  }
-  result->value[count] = '\0';
-
-  RETURN_OBJ(result);
-  */
-  RETURN_ERROR("Subscript ranges for strings are not implemented yet.");
+  RETURN_VAL(wrenNewStringFromRange(vm, string, start, count, step));
 }
 
 DEF_PRIMITIVE(string_toString)
diff --git a/src/vm/wren_utils.c b/src/vm/wren_utils.c
index 32a2b3a7..b5c31888 100644
--- a/src/vm/wren_utils.c
+++ b/src/vm/wren_utils.c
@@ -59,7 +59,7 @@ int wrenSymbolTableFind(SymbolTable* symbols, const char* name, size_t length)
   return -1;
 }
 
-int wrenUtf8NumBytes(int value)
+int wrenUtf8EncodeNumBytes(int value)
 {
   ASSERT(value >= 0, "Cannot encode a negative value.");
   
@@ -70,12 +70,13 @@ int wrenUtf8NumBytes(int value)
   return 0;
 }
 
-void wrenUtf8Encode(int value, uint8_t* bytes)
+int wrenUtf8Encode(int value, uint8_t* bytes)
 {
   if (value <= 0x7f)
   {
     // Single byte (i.e. fits in ASCII).
     *bytes = value & 0x7f;
+    return 1;
   }
   else if (value <= 0x7ff)
   {
@@ -83,6 +84,7 @@ void wrenUtf8Encode(int value, uint8_t* bytes)
     *bytes = 0xc0 | ((value & 0x7c0) >> 6);
     bytes++;
     *bytes = 0x80 | (value & 0x3f);
+    return 2;
   }
   else if (value <= 0xffff)
   {
@@ -92,6 +94,7 @@ void wrenUtf8Encode(int value, uint8_t* bytes)
     *bytes = 0x80 | ((value & 0xfc0) >> 6);
     bytes++;
     *bytes = 0x80 | (value & 0x3f);
+    return 3;
   }
   else if (value <= 0x10ffff)
   {
@@ -103,12 +106,11 @@ void wrenUtf8Encode(int value, uint8_t* bytes)
     *bytes = 0x80 | ((value & 0xfc0) >> 6);
     bytes++;
     *bytes = 0x80 | (value & 0x3f);
+    return 4;
   }
-  else
-  {
-    // Invalid Unicode value. See: http://tools.ietf.org/html/rfc3629
-    ASSERT(false, "Invalid UTF-8 value.");
-  }
+
+  // Invalid Unicode value. See: http://tools.ietf.org/html/rfc3629
+  UNREACHABLE();
 }
 
 int wrenUtf8Decode(const uint8_t* bytes, uint32_t length)
@@ -158,3 +160,19 @@ int wrenUtf8Decode(const uint8_t* bytes, uint32_t length)
 
   return value;
 }
+
+int wrenUtf8DecodeNumBytes(const char* string, uint32_t index)
+{
+  char first = string[index];
+  
+  // If the byte looks like 10xxxxxx, it's a continuation byte in the middle
+  // of a UTF-8 sequence, so don't count it at all.
+  if ((first & 0xc0) == 0x80) return 0;
+  
+  // Otherwise the lead byte's high bits give the sequence length: 11110xxx
+  // is 4 bytes, 1110xxxx is 3, 110xxxxx is 2, and anything else counts as 1.
+  if ((first & 0xf8) == 0xf0) return 4;
+  if ((first & 0xf0) == 0xe0) return 3;
+  if ((first & 0xe0) == 0xc0) return 2;
+  return 1;
+}
diff --git a/src/vm/wren_utils.h b/src/vm/wren_utils.h
index f81063f3..469162c1 100644
--- a/src/vm/wren_utils.h
+++ b/src/vm/wren_utils.h
@@ -100,14 +100,25 @@ int wrenSymbolTableFind(SymbolTable* symbols, const char* name, size_t length);
 // Returns the number of bytes needed to encode [value] in UTF-8.
 //
 // Returns 0 if [value] is too large to encode.
-int wrenUtf8NumBytes(int value);
+int wrenUtf8EncodeNumBytes(int value);
 
 // Encodes value as a series of bytes in [bytes], which is assumed to be large
 // enough to hold the encoded result.
-void wrenUtf8Encode(int value, uint8_t* bytes);
+//
+// Returns the number of written bytes.
+int wrenUtf8Encode(int value, uint8_t* bytes);
 
-// Decodes the UTF-8 sequence in [bytes] (which has max [length]), returning
-// the code point.
+// Decodes the UTF-8 sequence starting at [bytes] (which has max [length]),
+// returning the code point.
+//
+// Returns -1 if the bytes are not a valid UTF-8 sequence.
 int wrenUtf8Decode(const uint8_t* bytes, uint32_t length);
 
+// Returns the number of bytes in the UTF-8 sequence starting at [index] in
+// [string].
+//
+// If the character at that index is not the beginning of a UTF-8 sequence,
+// returns 0.
+int wrenUtf8DecodeNumBytes(const char* string, uint32_t index);
+
 #endif
diff --git a/src/vm/wren_value.c b/src/vm/wren_value.c
index e4b75cf7..dec3c080 100644
--- a/src/vm/wren_value.c
+++ b/src/vm/wren_value.c
@@ -625,13 +625,41 @@ Value wrenNewString(WrenVM* vm, const char* text, size_t length)
   ObjString* string = allocateString(vm, length);
 
   // Copy the string (if given one).
-  if (length > 0) memcpy(string->value, text, length);
+  if (length > 0 && text != NULL) memcpy(string->value, text, length);
 
   hashString(string);
-
   return OBJ_VAL(string);
 }
 
+Value wrenNewStringFromRange(WrenVM* vm, ObjString* source, int start,
+                             uint32_t count, int step)
+{
+  // Pass one sizes the result and pass two copies it. Both decode the same
+  // way, so the allocated length always equals the bytes written; offsets
+  // that fail to decode contribute nothing to either pass.
+  uint8_t* from = (uint8_t*)source->value;
+  int length = 0;
+  for (uint32_t i = 0; i < count; i++)
+  {
+    int index = start + i * step;
+    int codePoint = wrenUtf8Decode(from + index, source->length - index);
+    if (codePoint != -1) length += wrenUtf8EncodeNumBytes(codePoint);
+  }
+
+  ObjString* result = allocateString(vm, length);
+  result->value[length] = '\0';
+  uint8_t* to = (uint8_t*)result->value;
+  for (uint32_t i = 0; i < count; i++)
+  {
+    int index = start + i * step;
+    int codePoint = wrenUtf8Decode(from + index, source->length - index);
+    if (codePoint != -1) to += wrenUtf8Encode(codePoint, to);
+  }
+
+  hashString(result);
+  return OBJ_VAL(result);
+}
+
 Value wrenNumToString(WrenVM* vm, double value)
 {
   // Corner case: If the value is NaN, different versions of libc produce
@@ -664,7 +692,7 @@ Value wrenNumToString(WrenVM* vm, double value)
 
 Value wrenStringFromCodePoint(WrenVM* vm, int value)
 {
-  int length = wrenUtf8NumBytes(value);
+  int length = wrenUtf8EncodeNumBytes(value);
   ASSERT(length != 0, "Value out of range.");
 
   ObjString* string = allocateString(vm, length);
@@ -743,19 +771,8 @@ Value wrenStringFormat(WrenVM* vm, const char* format, ...)
 Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, uint32_t index)
 {
   ASSERT(index < string->length, "Index out of bounds.");
-
-  char first = string->value[index];
-
-  // The first byte's high bits tell us how many bytes are in the UTF-8
-  // sequence. If the byte starts with 10xxxxx, it's the middle of a UTF-8
-  // sequence, so return an empty string.
-  int numBytes;
-  if      ((first & 0xc0) == 0x80) numBytes = 0;
-  else if ((first & 0xf8) == 0xf0) numBytes = 4;
-  else if ((first & 0xf0) == 0xe0) numBytes = 3;
-  else if ((first & 0xe0) == 0xc0) numBytes = 2;
-  else numBytes = 1;
-
+  
+  int numBytes = wrenUtf8DecodeNumBytes(string->value, index);
   return wrenNewString(vm, string->value + index, numBytes);
 }
 
diff --git a/src/vm/wren_value.h b/src/vm/wren_value.h
index f453bcd7..442180d2 100644
--- a/src/vm/wren_value.h
+++ b/src/vm/wren_value.h
@@ -715,6 +715,12 @@ Value wrenNewRange(WrenVM* vm, double from, double to, bool isInclusive);
 // [text] may be NULL if [length] is zero.
 Value wrenNewString(WrenVM* vm, const char* text, size_t length);
 
+// Creates a new string from [source] by visiting [count] byte offsets,
+// beginning at [start] and advancing by [step], and appending the code point
+// decoded at each one. Offsets that fail to decode as UTF-8 are skipped.
+Value wrenNewStringFromRange(WrenVM* vm, ObjString* source, int start,
+                             uint32_t count, int step);
+
 // Produces a string representation of [value].
 Value wrenNumToString(WrenVM* vm, double value);
 
diff --git a/test/core/string/subscript_range.wren b/test/core/string/subscript_range.wren
index 2aa824a2..d78359b8 100644
--- a/test/core/string/subscript_range.wren
+++ b/test/core/string/subscript_range.wren
@@ -1,4 +1,3 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var string = "abcde"
 IO.print(string[0..0]) // expect: a
 IO.print(string[1...1] == "") // expect: true
@@ -33,3 +32,16 @@ IO.print(string[3...-6]) // expect: dcba
 // An empty range at zero is allowed on an empty string.
 IO.print(""[0...0] == "") // expect: true
 IO.print(""[0..-1] == "") // expect: true
+
+// Indexes by byte, not code point.
+//
+// Bytes:           11111
+//        012345678901234
+// Chars: sø mé ஃ  thî ng
+IO.print("søméஃthîng"[0..3]) // expect: søm
+IO.print("søméஃthîng"[3...10]) // expect: méஃt
+
+// Only includes sequences whose first byte is in the range.
+IO.print("søméஃthîng"[2..6]) // expect: méஃ
+IO.print("søméஃthîng"[2...6]) // expect: mé
+IO.print("søméஃthîng"[2...7]) // expect: méஃ
diff --git a/test/core/string/subscript_range_from_not_int.wren b/test/core/string/subscript_range_from_not_int.wren
index f397f5c9..1c46f0fe 100644
--- a/test/core/string/subscript_range_from_not_int.wren
+++ b/test/core/string/subscript_range_from_not_int.wren
@@ -1,3 +1,2 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var a = "string"
 a[1.5..2] // expect runtime error: Range start must be an integer.
diff --git a/test/core/string/subscript_range_from_too_large.wren b/test/core/string/subscript_range_from_too_large.wren
index e91bed09..e0240d6a 100644
--- a/test/core/string/subscript_range_from_too_large.wren
+++ b/test/core/string/subscript_range_from_too_large.wren
@@ -1,3 +1,2 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var a = "123"
 a[3..2] // expect runtime error: Range start out of bounds.
diff --git a/test/core/string/subscript_range_from_too_small.wren b/test/core/string/subscript_range_from_too_small.wren
index b0460ef1..3daec49c 100644
--- a/test/core/string/subscript_range_from_too_small.wren
+++ b/test/core/string/subscript_range_from_too_small.wren
@@ -1,3 +1,2 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var a = "123"
 a[-4..2] // expect runtime error: Range start out of bounds.
diff --git a/test/core/string/subscript_range_to_exclusive_too_large.wren b/test/core/string/subscript_range_to_exclusive_too_large.wren
index cc5307cf..3f34520b 100644
--- a/test/core/string/subscript_range_to_exclusive_too_large.wren
+++ b/test/core/string/subscript_range_to_exclusive_too_large.wren
@@ -1,3 +1,2 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var a = "123"
 a[1...4] // expect runtime error: Range end out of bounds.
diff --git a/test/core/string/subscript_range_to_exclusive_too_small.wren b/test/core/string/subscript_range_to_exclusive_too_small.wren
index 747a7313..2217e817 100644
--- a/test/core/string/subscript_range_to_exclusive_too_small.wren
+++ b/test/core/string/subscript_range_to_exclusive_too_small.wren
@@ -1,3 +1,2 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var a = "123"
 a[0...-5] // expect runtime error: Range end out of bounds.
diff --git a/test/core/string/subscript_range_to_not_int.wren b/test/core/string/subscript_range_to_not_int.wren
index ca2d12f4..059b2907 100644
--- a/test/core/string/subscript_range_to_not_int.wren
+++ b/test/core/string/subscript_range_to_not_int.wren
@@ -1,3 +1,2 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var a = "string"
 a[1..2.5] // expect runtime error: Range end must be an integer.
diff --git a/test/core/string/subscript_range_to_too_large.wren b/test/core/string/subscript_range_to_too_large.wren
index 701e3995..d0cbaaca 100644
--- a/test/core/string/subscript_range_to_too_large.wren
+++ b/test/core/string/subscript_range_to_too_large.wren
@@ -1,3 +1,2 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var a = "123"
 a[1..3] // expect runtime error: Range end out of bounds.
diff --git a/test/core/string/subscript_range_to_too_small.wren b/test/core/string/subscript_range_to_too_small.wren
index 9dcc5240..71f241a4 100644
--- a/test/core/string/subscript_range_to_too_small.wren
+++ b/test/core/string/subscript_range_to_too_small.wren
@@ -1,3 +1,2 @@
-// skip: Range subscripts for strings don't handle UTF-8.
 var a = "123"
 a[0..-4] // expect runtime error: Range end out of bounds.