From 875ee9e55a288133c1cb23df9de06be866415501 Mon Sep 17 00:00:00 2001
From: Marco Lizza <marco.lizza@gmail.com>
Date: Wed, 4 Feb 2015 13:38:15 +0100
Subject: [PATCH 1/5] Replacing 'strstr()' with 'wrenStringFind()' to support
 8-bit strings. Internally the Boyer-Moore-Horspool algorithm is used.

---
 src/wren_core.c  |  6 +++---
 src/wren_value.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/wren_value.h |  5 +++++
 3 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/src/wren_core.c b/src/wren_core.c
index dca76df8..76373e8b 100644
--- a/src/wren_core.c
+++ b/src/wren_core.c
@@ -1159,7 +1159,7 @@ DEF_NATIVE(string_contains)
   // Corner case, the empty string contains the empty string.
   if (string->length == 0 && search->length == 0) RETURN_TRUE;
 
-  RETURN_BOOL(strstr(string->value, search->value) != NULL);
+  RETURN_BOOL(wrenStringFind(vm, string, search) != string->length);
 }
 
 DEF_NATIVE(string_count)
@@ -1191,9 +1191,9 @@ DEF_NATIVE(string_indexOf)
   ObjString* string = AS_STRING(args[0]);
   ObjString* search = AS_STRING(args[1]);
 
-  char* firstOccurrence = strstr(string->value, search->value);
+  uint32_t index = wrenStringFind(vm, string, search);
 
-  RETURN_NUM(firstOccurrence ? firstOccurrence - string->value : -1);
+  RETURN_NUM(index == string->length ? -1 : (int)index);
 }
 
 DEF_NATIVE(string_iterate)
diff --git a/src/wren_value.c b/src/wren_value.c
index 18eb0480..6af0cd01 100644
--- a/src/wren_value.c
+++ b/src/wren_value.c
@@ -670,6 +670,59 @@ Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, int index)
   return value;
 }
 
+// Implementing Boyer-Moore-Horspool string matching algorithm.
+uint32_t wrenStringFind(WrenVM* vm, ObjString* haystack, ObjString* needle)
+{
+  uint32_t length, last_index, index;
+  uint32_t shift[256]; // Full 8-bit alphabet
+  char last_char;
+  // We need to convert the 'char' value to 'unsigned char' in order not
+  // to get negative indexes. We use a union as an elegant alternative to
+  // explicit coercion (whose behaviour is not sure across compilers)
+  union {
+    char c;
+    unsigned char uc;
+  } value;
+
+  // If the needle is longer than the haystack it won't be found.
+  if (needle->length > haystack->length)
+  {
+    return haystack->length;
+  }
+
+  // Precalculate shifts.
+  last_index = needle->length - 1;
+
+  for (index = 0; index < 256; index++)
+  {
+    shift[index] = needle->length;
+  }
+  for (index = 0; index < last_index; index++)
+  {
+    value.c = needle->value[index];
+    shift[value.uc] = last_index - index;
+  }
+
+  // Search, left to right.
+  last_char = needle->value[last_index];
+
+  length = haystack->length - needle->length;
+
+  for (index = 0; index <= length; )
+  {
+    value.c = haystack->value[index + last_index];
+    if ((last_char == value.c) &&
+      memcmp(haystack->value + index, needle->value, last_index) == 0)
+    {
+      return index;
+    }
+    index += shift[value.uc]; // Convert, same as above.
+  }
+
+  // Not found.
+  return haystack->length;
+}
+
 Upvalue* wrenNewUpvalue(WrenVM* vm, Value* value)
 {
   Upvalue* upvalue = ALLOCATE(vm, Upvalue);
diff --git a/src/wren_value.h b/src/wren_value.h
index f5c1a24b..ac688d27 100644
--- a/src/wren_value.h
+++ b/src/wren_value.h
@@ -667,6 +667,11 @@ ObjString* wrenStringConcat(WrenVM* vm, const char* left, const char* right);
 // empty string.
 Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, int index);
 
+// Search for the first occurence of [needle] within [haystack] and returns its
+// zero-based offset. Returns [haystack->length] if [haystack] does not contain
+// [needle].
+uint32_t wrenStringFind(WrenVM* vm, ObjString* haystack, ObjString* needle);
+
 // Creates a new open upvalue pointing to [value] on the stack.
 Upvalue* wrenNewUpvalue(WrenVM* vm, Value* value);
 

From d3ca27fb9a949b5bc465571e38a7f1b5deca1495 Mon Sep 17 00:00:00 2001
From: Marco Lizza <marco.lizza@gmail.com>
Date: Wed, 4 Feb 2015 13:44:39 +0100
Subject: [PATCH 2/5] Changing 'wrenStringConcat()' to support null-containing
 strings.

---
 src/wren_core.c  | 20 ++++++++++++--------
 src/wren_value.c | 14 ++++++++------
 src/wren_value.h |  7 +++++--
 3 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/src/wren_core.c b/src/wren_core.c
index dca76df8..e6917b9e 100644
--- a/src/wren_core.c
+++ b/src/wren_core.c
@@ -175,7 +175,7 @@ static bool validateFn(WrenVM* vm, Value* args, int index, const char* argName)
 {
   if (IS_FN(args[index]) || IS_CLOSURE(args[index])) return true;
 
-  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, " must be a function."));
+  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " must be a function.", -1));
   return false;
 }
 
@@ -185,7 +185,7 @@ static bool validateNum(WrenVM* vm, Value* args, int index, const char* argName)
 {
   if (IS_NUM(args[index])) return true;
 
-  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, " must be a number."));
+  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " must be a number.", -1));
   return false;
 }
 
@@ -196,7 +196,7 @@ static bool validateIntValue(WrenVM* vm, Value* args, double value,
 {
   if (trunc(value) == value) return true;
 
-  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, " must be an integer."));
+  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " must be an integer.", -1));
   return false;
 }
 
@@ -226,7 +226,7 @@ static int validateIndexValue(WrenVM* vm, Value* args, int count, double value,
   // Check bounds.
   if (index >= 0 && index < count) return index;
 
-  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, " out of bounds."));
+  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " out of bounds.", -1));
   return -1;
 }
 
@@ -263,7 +263,7 @@ static bool validateString(WrenVM* vm, Value* args, int index,
 {
   if (IS_STRING(args[index])) return true;
 
-  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, " must be a string."));
+  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " must be a string.", -1));
   return false;
 }
 
@@ -1056,8 +1056,9 @@ DEF_NATIVE(object_toString)
   else if (IS_INSTANCE(args[0]))
   {
     ObjInstance* instance = AS_INSTANCE(args[0]);
-    RETURN_OBJ(wrenStringConcat(vm, "instance of ",
-                                instance->obj.classObj->name->value));
+    ObjString* name = instance->obj.classObj->name;
+    RETURN_OBJ(wrenStringConcat(vm, "instance of ", -1,
+                                name->value, name->length));
   }
 
   RETURN_VAL(wrenNewString(vm, "<object>", 8));
@@ -1253,7 +1254,10 @@ DEF_NATIVE(string_toString)
 DEF_NATIVE(string_plus)
 {
   if (!validateString(vm, args, 1, "Right operand")) return PRIM_ERROR;
-  RETURN_OBJ(wrenStringConcat(vm, AS_CSTRING(args[0]), AS_CSTRING(args[1])));
+  ObjString* left = AS_STRING(args[0]);
+  ObjString* right = AS_STRING(args[1]);
+  RETURN_OBJ(wrenStringConcat(vm, left->value, left->length,
+                              right->value, right->length));
 }
 
 DEF_NATIVE(string_subscript)
diff --git a/src/wren_value.c b/src/wren_value.c
index 18eb0480..2499759f 100644
--- a/src/wren_value.c
+++ b/src/wren_value.c
@@ -86,7 +86,8 @@ ObjClass* wrenNewClass(WrenVM* vm, ObjClass* superclass, int numFields,
   wrenPushRoot(vm, (Obj*)name);
 
   // Create the metaclass.
-  ObjString* metaclassName = wrenStringConcat(vm, name->value, " metaclass");
+  ObjString* metaclassName = wrenStringConcat(vm, name->value, name->length,
+                                              " metaclass", -1);
   wrenPushRoot(vm, (Obj*)metaclassName);
 
   ObjClass* metaclass = wrenNewSingleClass(vm, 0, metaclassName);
@@ -632,15 +633,16 @@ Value wrenNewUninitializedString(WrenVM* vm, size_t length)
   return OBJ_VAL(string);
 }
 
-ObjString* wrenStringConcat(WrenVM* vm, const char* left, const char* right)
+ObjString* wrenStringConcat(WrenVM* vm, const char* left, int leftLength,
+                            const char* right, int rightLength)
 {
-  size_t leftLength = strlen(left);
-  size_t rightLength = strlen(right);
+  if (leftLength == -1) leftLength = (int)strlen(left);
+  if (rightLength == -1) rightLength = (int)strlen(right);
 
   Value value = wrenNewUninitializedString(vm, leftLength + rightLength);
   ObjString* string = AS_STRING(value);
-  strcpy(string->value, left);
-  strcpy(string->value + leftLength, right);
+  memcpy(string->value, left, leftLength);
+  memcpy(string->value + leftLength, right, rightLength);
   string->value[leftLength + rightLength] = '\0';
 
   return string;
diff --git a/src/wren_value.h b/src/wren_value.h
index f5c1a24b..293903a6 100644
--- a/src/wren_value.h
+++ b/src/wren_value.h
@@ -659,8 +659,11 @@ Value wrenNewString(WrenVM* vm, const char* text, size_t length);
 // The caller is expected to fully initialize the buffer after calling.
 Value wrenNewUninitializedString(WrenVM* vm, size_t length);
 
-// Creates a new string that is the concatenation of [left] and [right].
-ObjString* wrenStringConcat(WrenVM* vm, const char* left, const char* right);
+// Creates a new string that is the concatenation of [left] and [right] (of
+// length [leftLength] and [rightLength], respectively). If -1 is passed
+// the string length is automatically calculated.
+ObjString* wrenStringConcat(WrenVM* vm, const char* left, int leftLength,
+                            const char* right, int rightLength);
 
 // Creates a new string containing the code point in [string] starting at byte
 // [index]. If [index] points into the middle of a UTF-8 sequence, returns an

From 096e5cf33c87dc558c3d5c604c2425d9751e3e8d Mon Sep 17 00:00:00 2001
From: Marco Lizza <marco.lizza@gmail.com>
Date: Wed, 4 Feb 2015 13:53:59 +0100
Subject: [PATCH 3/5] Fixing empty search string corner case.

---
 src/wren_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/wren_core.c b/src/wren_core.c
index 76373e8b..9fa118cc 100644
--- a/src/wren_core.c
+++ b/src/wren_core.c
@@ -1156,8 +1156,8 @@ DEF_NATIVE(string_contains)
   ObjString* string = AS_STRING(args[0]);
   ObjString* search = AS_STRING(args[1]);
 
-  // Corner case, the empty string contains the empty string.
-  if (string->length == 0 && search->length == 0) RETURN_TRUE;
+  // Corner case, the empty string is always contained.
+  if (search->length == 0) RETURN_TRUE;
 
   RETURN_BOOL(wrenStringFind(vm, string, search) != string->length);
 }

From aa9cfffb3479faeae53c09ad23a6767957e97a0c Mon Sep 17 00:00:00 2001
From: Bob Nystrom <robert@stuffwithstuff.com>
Date: Wed, 4 Feb 2015 08:15:35 -0800
Subject: [PATCH 4/5] Keep code within 80 columns.

---
 src/wren_core.c  | 12 ++++++++----
 src/wren_value.h |  2 +-
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/wren_core.c b/src/wren_core.c
index 6477bf10..8d5420a1 100644
--- a/src/wren_core.c
+++ b/src/wren_core.c
@@ -175,7 +175,8 @@ static bool validateFn(WrenVM* vm, Value* args, int index, const char* argName)
 {
   if (IS_FN(args[index]) || IS_CLOSURE(args[index])) return true;
 
-  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " must be a function.", -1));
+  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1,
+                                     " must be a function.", -1));
   return false;
 }
 
@@ -185,7 +186,8 @@ static bool validateNum(WrenVM* vm, Value* args, int index, const char* argName)
 {
   if (IS_NUM(args[index])) return true;
 
-  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " must be a number.", -1));
+  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1,
+                                     " must be a number.", -1));
   return false;
 }
 
@@ -196,7 +198,8 @@ static bool validateIntValue(WrenVM* vm, Value* args, double value,
 {
   if (trunc(value) == value) return true;
 
-  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " must be an integer.", -1));
+  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1,
+                                     " must be an integer.", -1));
   return false;
 }
 
@@ -263,7 +266,8 @@ static bool validateString(WrenVM* vm, Value* args, int index,
 {
   if (IS_STRING(args[index])) return true;
 
-  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " must be a string.", -1));
+  args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1,
+                                     " must be a string.", -1));
   return false;
 }
 
diff --git a/src/wren_value.h b/src/wren_value.h
index 293903a6..d2c59cd0 100644
--- a/src/wren_value.h
+++ b/src/wren_value.h
@@ -659,7 +659,7 @@ Value wrenNewString(WrenVM* vm, const char* text, size_t length);
 // The caller is expected to fully initialize the buffer after calling.
 Value wrenNewUninitializedString(WrenVM* vm, size_t length);
 
-// Creates a new string that is the concatenation of [left] and [right] (of
+// Creates a new string that is the concatenation of [left] and [right] (with
 // length [leftLength] and [rightLength], respectively). If -1 is passed
 // the string length is automatically calculated.
 ObjString* wrenStringConcat(WrenVM* vm, const char* left, int leftLength,

From 878be6fb6b7ee1ed3d0b96a7821bfdc716857dcd Mon Sep 17 00:00:00 2001
From: Bob Nystrom <robert@stuffwithstuff.com>
Date: Wed, 4 Feb 2015 20:23:50 -0800
Subject: [PATCH 5/5] Clean up wrenStringFind() a bit.

---
 src/wren_core.c           |  7 ++--
 src/wren_value.c          | 68 +++++++++++++++++++++------------------
 src/wren_value.h          |  2 +-
 test/string/index_of.wren |  8 +++++
 4 files changed, 48 insertions(+), 37 deletions(-)

diff --git a/src/wren_core.c b/src/wren_core.c
index 68ba963f..001a9c0c 100644
--- a/src/wren_core.c
+++ b/src/wren_core.c
@@ -1161,10 +1161,7 @@ DEF_NATIVE(string_contains)
   ObjString* string = AS_STRING(args[0]);
   ObjString* search = AS_STRING(args[1]);
 
-  // Corner case, the empty string is always contained.
-  if (search->length == 0) RETURN_TRUE;
-
-  RETURN_BOOL(wrenStringFind(vm, string, search) != string->length);
+  RETURN_BOOL(wrenStringFind(vm, string, search) != UINT32_MAX);
 }
 
 DEF_NATIVE(string_count)
@@ -1198,7 +1195,7 @@ DEF_NATIVE(string_indexOf)
 
   uint32_t index = wrenStringFind(vm, string, search);
 
-  RETURN_NUM(index == string->length ? -1 : (int)index);
+  RETURN_NUM(index == UINT32_MAX ? -1 : (int)index);
 }
 
 DEF_NATIVE(string_iterate)
diff --git a/src/wren_value.c b/src/wren_value.c
index 80d167e0..f6cdaa46 100644
--- a/src/wren_value.c
+++ b/src/wren_value.c
@@ -672,57 +672,63 @@ Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, int index)
   return value;
 }
 
-// Implementing Boyer-Moore-Horspool string matching algorithm.
+// Uses the Boyer-Moore-Horspool string matching algorithm.
 uint32_t wrenStringFind(WrenVM* vm, ObjString* haystack, ObjString* needle)
 {
-  uint32_t length, last_index, index;
-  uint32_t shift[256]; // Full 8-bit alphabet
-  char last_char;
-  // We need to convert the 'char' value to 'unsigned char' in order not
-  // to get negative indexes. We use a union as an elegant alternative to
-  // explicit coercion (whose behaviour is not sure across compilers)
-  union {
-    char c;
-    unsigned char uc;
-  } value;
+  // Corner case, an empty needle is always found.
+  if (needle->length == 0) return 0;
 
   // If the needle is longer than the haystack it won't be found.
-  if (needle->length > haystack->length)
-  {
-    return haystack->length;
-  }
+  if (needle->length > haystack->length) return UINT32_MAX;
 
-  // Precalculate shifts.
-  last_index = needle->length - 1;
+  // Pre-calculate the shift table. For each character (8-bit value), we
+  // determine how far the search window can be advanced if that character is
+  // the last character in the haystack where we are searching for the needle
+  // and the needle doesn't match there.
+  uint32_t shift[UINT8_MAX];
+  uint32_t needleEnd = needle->length - 1;
 
-  for (index = 0; index < 256; index++)
+  // By default, we assume the character is not the needle at all. In that case
+  // case, if a match fails on that character, we can advance one whole needle
+  // width since.
+  for (uint32_t index = 0; index < UINT8_MAX; index++)
   {
     shift[index] = needle->length;
   }
-  for (index = 0; index < last_index; index++)
+
+  // Then, for every character in the needle, determine how far it is from the
+  // end. If a match fails on that character, we can advance the window such
+  // that it the last character in it lines up with the last place we could
+  // find it in the needle.
+  for (uint32_t index = 0; index < needleEnd; index++)
   {
-    value.c = needle->value[index];
-    shift[value.uc] = last_index - index;
+    char c = needle->value[index];
+    shift[(uint8_t)c] = needleEnd - index;
   }
 
-  // Search, left to right.
-  last_char = needle->value[last_index];
+  // Slide the needle across the haystack, looking for the first match or
+  // stopping if the needle goes off the end.
+  char lastChar = needle->value[needleEnd];
+  uint32_t range = haystack->length - needle->length;
 
-  length = haystack->length - needle->length;
-
-  for (index = 0; index <= length; )
+  for (uint32_t index = 0; index <= range; )
   {
-    value.c = haystack->value[index + last_index];
-    if ((last_char == value.c) &&
-      memcmp(haystack->value + index, needle->value, last_index) == 0)
+    // Compare the last character in the haystack's window to the last character
+    // in the needle. If it matches, see if the whole needle matches.
+    char c = haystack->value[index + needleEnd];
+    if (lastChar == c &&
+        memcmp(haystack->value + index, needle->value, needleEnd) == 0)
     {
+      // Found a match.
       return index;
     }
-    index += shift[value.uc]; // Convert, same as above.
+
+    // Otherwise, slide the needle forward.
+    index += shift[(uint8_t)c];
   }
 
   // Not found.
-  return haystack->length;
+  return UINT32_MAX;
 }
 
 Upvalue* wrenNewUpvalue(WrenVM* vm, Value* value)
diff --git a/src/wren_value.h b/src/wren_value.h
index c99197b1..b50d1a0e 100644
--- a/src/wren_value.h
+++ b/src/wren_value.h
@@ -671,7 +671,7 @@ ObjString* wrenStringConcat(WrenVM* vm, const char* left, int leftLength,
 Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, int index);
 
 // Search for the first occurence of [needle] within [haystack] and returns its
-// zero-based offset. Returns [haystack->length] if [haystack] does not contain
+// zero-based offset. Returns `UINT32_MAX` if [haystack] does not contain
 // [needle].
 uint32_t wrenStringFind(WrenVM* vm, ObjString* haystack, ObjString* needle);
 
diff --git a/test/string/index_of.wren b/test/string/index_of.wren
index 8ee67978..7c8ca35b 100644
--- a/test/string/index_of.wren
+++ b/test/string/index_of.wren
@@ -1,8 +1,16 @@
+IO.print("abcd".indexOf("")) // expect: 0
 IO.print("abcd".indexOf("cd")) // expect: 2
 IO.print("abcd".indexOf("a")) // expect: 0
+IO.print("abcd".indexOf("abcd")) // expect: 0
 IO.print("abcd".indexOf("abcde")) // expect: -1
 IO.print("abab".indexOf("ab")) // expect: 0
 
+// More complex cases.
+IO.print("abcdefabcdefg".indexOf("defg")) // expect: 9
+IO.print("abcdabcdabcd".indexOf("dab")) // expect: 3
+IO.print("abcdabcdabcdabcd".indexOf("dabcdabc")) // expect: 3
+IO.print("abcdefg".indexOf("abcdef!")) // expect: -1
+
 // Non-ASCII. Note that it returns byte indices, not code points.
 IO.print("søméஃthîng".indexOf("e")) // expect: -1
 IO.print("søméஃthîng".indexOf("m")) // expect: 3