From 875ee9e55a288133c1cb23df9de06be866415501 Mon Sep 17 00:00:00 2001 From: Marco Lizza Date: Wed, 4 Feb 2015 13:38:15 +0100 Subject: [PATCH 1/5] Replacing 'strstr()' with 'wrenStringFind()' to support 8-bit strings. Internally the Boyer-Moore-Horspool algorithm is used. --- src/wren_core.c | 6 +++--- src/wren_value.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ src/wren_value.h | 5 +++++ 3 files changed, 61 insertions(+), 3 deletions(-) diff --git a/src/wren_core.c b/src/wren_core.c index dca76df8..76373e8b 100644 --- a/src/wren_core.c +++ b/src/wren_core.c @@ -1159,7 +1159,7 @@ DEF_NATIVE(string_contains) // Corner case, the empty string contains the empty string. if (string->length == 0 && search->length == 0) RETURN_TRUE; - RETURN_BOOL(strstr(string->value, search->value) != NULL); + RETURN_BOOL(wrenStringFind(vm, string, search) != string->length); } DEF_NATIVE(string_count) @@ -1191,9 +1191,9 @@ DEF_NATIVE(string_indexOf) ObjString* string = AS_STRING(args[0]); ObjString* search = AS_STRING(args[1]); - char* firstOccurrence = strstr(string->value, search->value); + uint32_t index = wrenStringFind(vm, string, search); - RETURN_NUM(firstOccurrence ? firstOccurrence - string->value : -1); + RETURN_NUM(index == string->length ? -1 : (int)index); } DEF_NATIVE(string_iterate) diff --git a/src/wren_value.c b/src/wren_value.c index 18eb0480..6af0cd01 100644 --- a/src/wren_value.c +++ b/src/wren_value.c @@ -670,6 +670,59 @@ Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, int index) return value; } +// Implementing Boyer-Moore-Horspool string matching algorithm. +uint32_t wrenStringFind(WrenVM* vm, ObjString* haystack, ObjString* needle) +{ + uint32_t length, last_index, index; + uint32_t shift[256]; // Full 8-bit alphabet + char last_char; + // We need to convert the 'char' value to 'unsigned char' in order not + // to get negative indexes. We use a union as an elegant alternative to + // explicit coercion (whose behaviour is not sure across compilers) + union { + char c; + unsigned char uc; + } value; + + // If the needle is longer than the haystack it won't be found. + if (needle->length > haystack->length) + { + return haystack->length; + } + + // Precalculate shifts. + last_index = needle->length - 1; + + for (index = 0; index < 256; index++) + { + shift[index] = needle->length; + } + for (index = 0; index < last_index; index++) + { + value.c = needle->value[index]; + shift[value.uc] = last_index - index; + } + + // Search, left to right. + last_char = needle->value[last_index]; + + length = haystack->length - needle->length; + + for (index = 0; index <= length; ) + { + value.c = haystack->value[index + last_index]; + if ((last_char == value.c) && + memcmp(haystack->value + index, needle->value, last_index) == 0) + { + return index; + } + index += shift[value.uc]; // Convert, same as above. + } + + // Not found. + return haystack->length; +} + Upvalue* wrenNewUpvalue(WrenVM* vm, Value* value) { Upvalue* upvalue = ALLOCATE(vm, Upvalue); diff --git a/src/wren_value.h b/src/wren_value.h index f5c1a24b..ac688d27 100644 --- a/src/wren_value.h +++ b/src/wren_value.h @@ -667,6 +667,11 @@ ObjString* wrenStringConcat(WrenVM* vm, const char* left, const char* right); // empty string. Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, int index); +// Search for the first occurence of [needle] within [haystack] and returns its +// zero-based offset. Returns [haystack->length] if [haystack] does not contain +// [needle]. +uint32_t wrenStringFind(WrenVM* vm, ObjString* haystack, ObjString* needle); + // Creates a new open upvalue pointing to [value] on the stack. Upvalue* wrenNewUpvalue(WrenVM* vm, Value* value); From d3ca27fb9a949b5bc465571e38a7f1b5deca1495 Mon Sep 17 00:00:00 2001 From: Marco Lizza Date: Wed, 4 Feb 2015 13:44:39 +0100 Subject: [PATCH 2/5] Changing 'wrenStringConcat()' to support null-containing strings. --- src/wren_core.c | 20 ++++++++++++-------- src/wren_value.c | 14 ++++++++------ src/wren_value.h | 7 +++++-- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/wren_core.c b/src/wren_core.c index dca76df8..e6917b9e 100644 --- a/src/wren_core.c +++ b/src/wren_core.c @@ -175,7 +175,7 @@ static bool validateFn(WrenVM* vm, Value* args, int index, const char* argName) { if (IS_FN(args[index]) || IS_CLOSURE(args[index])) return true; - args[0] = OBJ_VAL(wrenStringConcat(vm, argName, " must be a function.")); + args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " must be a function.", -1)); return false; } @@ -185,7 +185,7 @@ static bool validateNum(WrenVM* vm, Value* args, int index, const char* argName) { if (IS_NUM(args[index])) return true; - args[0] = OBJ_VAL(wrenStringConcat(vm, argName, " must be a number.")); + args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " must be a number.", -1)); return false; } @@ -196,7 +196,7 @@ static bool validateIntValue(WrenVM* vm, Value* args, double value, { if (trunc(value) == value) return true; - args[0] = OBJ_VAL(wrenStringConcat(vm, argName, " must be an integer.")); + args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " must be an integer.", -1)); return false; } @@ -226,7 +226,7 @@ static int validateIndexValue(WrenVM* vm, Value* args, int count, double value, // Check bounds. if (index >= 0 && index < count) return index; - args[0] = OBJ_VAL(wrenStringConcat(vm, argName, " out of bounds.")); + args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " out of bounds.", -1)); return -1; } @@ -263,7 +263,7 @@ static bool validateString(WrenVM* vm, Value* args, int index, { if (IS_STRING(args[index])) return true; - args[0] = OBJ_VAL(wrenStringConcat(vm, argName, " must be a string.")); + args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " must be a string.", -1)); return false; } @@ -1056,8 +1056,9 @@ DEF_NATIVE(object_toString) else if (IS_INSTANCE(args[0])) { ObjInstance* instance = AS_INSTANCE(args[0]); - RETURN_OBJ(wrenStringConcat(vm, "instance of ", - instance->obj.classObj->name->value)); + ObjString* name = instance->obj.classObj->name; + RETURN_OBJ(wrenStringConcat(vm, "instance of ", -1, + name->value, name->length)); } RETURN_VAL(wrenNewString(vm, "", 8)); @@ -1253,7 +1254,10 @@ DEF_NATIVE(string_toString) DEF_NATIVE(string_plus) { if (!validateString(vm, args, 1, "Right operand")) return PRIM_ERROR; - RETURN_OBJ(wrenStringConcat(vm, AS_CSTRING(args[0]), AS_CSTRING(args[1]))); + ObjString* left = AS_STRING(args[0]); + ObjString* right = AS_STRING(args[1]); + RETURN_OBJ(wrenStringConcat(vm, left->value, left->length, + right->value, right->length)); } DEF_NATIVE(string_subscript) diff --git a/src/wren_value.c b/src/wren_value.c index 18eb0480..2499759f 100644 --- a/src/wren_value.c +++ b/src/wren_value.c @@ -86,7 +86,8 @@ ObjClass* wrenNewClass(WrenVM* vm, ObjClass* superclass, int numFields, wrenPushRoot(vm, (Obj*)name); // Create the metaclass. - ObjString* metaclassName = wrenStringConcat(vm, name->value, " metaclass"); + ObjString* metaclassName = wrenStringConcat(vm, name->value, name->length, + " metaclass", -1); wrenPushRoot(vm, (Obj*)metaclassName); ObjClass* metaclass = wrenNewSingleClass(vm, 0, metaclassName); @@ -632,15 +633,16 @@ Value wrenNewUninitializedString(WrenVM* vm, size_t length) return OBJ_VAL(string); } -ObjString* wrenStringConcat(WrenVM* vm, const char* left, const char* right) +ObjString* wrenStringConcat(WrenVM* vm, const char* left, int leftLength, + const char* right, int rightLength) { - size_t leftLength = strlen(left); - size_t rightLength = strlen(right); + if (leftLength == -1) leftLength = (int)strlen(left); + if (rightLength == -1) rightLength = (int)strlen(right); Value value = wrenNewUninitializedString(vm, leftLength + rightLength); ObjString* string = AS_STRING(value); - strcpy(string->value, left); - strcpy(string->value + leftLength, right); + memcpy(string->value, left, leftLength); + memcpy(string->value + leftLength, right, rightLength); string->value[leftLength + rightLength] = '\0'; return string; diff --git a/src/wren_value.h b/src/wren_value.h index f5c1a24b..293903a6 100644 --- a/src/wren_value.h +++ b/src/wren_value.h @@ -659,8 +659,11 @@ Value wrenNewString(WrenVM* vm, const char* text, size_t length); // The caller is expected to fully initialize the buffer after calling. Value wrenNewUninitializedString(WrenVM* vm, size_t length); -// Creates a new string that is the concatenation of [left] and [right]. -ObjString* wrenStringConcat(WrenVM* vm, const char* left, const char* right); +// Creates a new string that is the concatenation of [left] and [right] (of +// length [leftLength] and [rightLength], respectively). If -1 is passed +// the string length is automatically calculated. +ObjString* wrenStringConcat(WrenVM* vm, const char* left, int leftLength, + const char* right, int rightLength); // Creates a new string containing the code point in [string] starting at byte // [index]. If [index] points into the middle of a UTF-8 sequence, returns an From 096e5cf33c87dc558c3d5c604c2425d9751e3e8d Mon Sep 17 00:00:00 2001 From: Marco Lizza Date: Wed, 4 Feb 2015 13:53:59 +0100 Subject: [PATCH 3/5] Fixing empty search string corner case. --- src/wren_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/wren_core.c b/src/wren_core.c index 76373e8b..9fa118cc 100644 --- a/src/wren_core.c +++ b/src/wren_core.c @@ -1156,8 +1156,8 @@ DEF_NATIVE(string_contains) ObjString* string = AS_STRING(args[0]); ObjString* search = AS_STRING(args[1]); - // Corner case, the empty string contains the empty string. - if (string->length == 0 && search->length == 0) RETURN_TRUE; + // Corner case, the empty string is always contained. + if (search->length == 0) RETURN_TRUE; RETURN_BOOL(wrenStringFind(vm, string, search) != string->length); } From aa9cfffb3479faeae53c09ad23a6767957e97a0c Mon Sep 17 00:00:00 2001 From: Bob Nystrom Date: Wed, 4 Feb 2015 08:15:35 -0800 Subject: [PATCH 4/5] Keep code within 80 columns. --- src/wren_core.c | 12 ++++++++---- src/wren_value.h | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/wren_core.c b/src/wren_core.c index 6477bf10..8d5420a1 100644 --- a/src/wren_core.c +++ b/src/wren_core.c @@ -175,7 +175,8 @@ static bool validateFn(WrenVM* vm, Value* args, int index, const char* argName) { if (IS_FN(args[index]) || IS_CLOSURE(args[index])) return true; - args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " must be a function.", -1)); + args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, + " must be a function.", -1)); return false; } @@ -185,7 +186,8 @@ static bool validateNum(WrenVM* vm, Value* args, int index, const char* argName) { if (IS_NUM(args[index])) return true; - args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " must be a number.", -1)); + args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, + " must be a number.", -1)); return false; } @@ -196,7 +198,8 @@ static bool validateIntValue(WrenVM* vm, Value* args, double value, { if (trunc(value) == value) return true; - args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " must be an integer.", -1)); + args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, + " must be an integer.", -1)); return false; } @@ -263,7 +266,8 @@ static bool validateString(WrenVM* vm, Value* args, int index, { if (IS_STRING(args[index])) return true; - args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, " must be a string.", -1)); + args[0] = OBJ_VAL(wrenStringConcat(vm, argName, -1, + " must be a string.", -1)); return false; } diff --git a/src/wren_value.h b/src/wren_value.h index 293903a6..d2c59cd0 100644 --- a/src/wren_value.h +++ b/src/wren_value.h @@ -659,7 +659,7 @@ Value wrenNewString(WrenVM* vm, const char* text, size_t length); // The caller is expected to fully initialize the buffer after calling. Value wrenNewUninitializedString(WrenVM* vm, size_t length); -// Creates a new string that is the concatenation of [left] and [right] (of +// Creates a new string that is the concatenation of [left] and [right] (with // length [leftLength] and [rightLength], respectively). If -1 is passed // the string length is automatically calculated. ObjString* wrenStringConcat(WrenVM* vm, const char* left, int leftLength, From 878be6fb6b7ee1ed3d0b96a7821bfdc716857dcd Mon Sep 17 00:00:00 2001 From: Bob Nystrom Date: Wed, 4 Feb 2015 20:23:50 -0800 Subject: [PATCH 5/5] Clean up wrenStringFind() a bit. --- src/wren_core.c | 7 ++-- src/wren_value.c | 68 +++++++++++++++++++++------------------ src/wren_value.h | 2 +- test/string/index_of.wren | 8 +++++ 4 files changed, 48 insertions(+), 37 deletions(-) diff --git a/src/wren_core.c b/src/wren_core.c index 68ba963f..001a9c0c 100644 --- a/src/wren_core.c +++ b/src/wren_core.c @@ -1161,10 +1161,7 @@ DEF_NATIVE(string_contains) ObjString* string = AS_STRING(args[0]); ObjString* search = AS_STRING(args[1]); - // Corner case, the empty string is always contained. - if (search->length == 0) RETURN_TRUE; - - RETURN_BOOL(wrenStringFind(vm, string, search) != string->length); + RETURN_BOOL(wrenStringFind(vm, string, search) != UINT32_MAX); } DEF_NATIVE(string_count) @@ -1198,7 +1195,7 @@ DEF_NATIVE(string_indexOf) uint32_t index = wrenStringFind(vm, string, search); - RETURN_NUM(index == string->length ? -1 : (int)index); + RETURN_NUM(index == UINT32_MAX ? -1 : (int)index); } DEF_NATIVE(string_iterate) diff --git a/src/wren_value.c b/src/wren_value.c index 80d167e0..f6cdaa46 100644 --- a/src/wren_value.c +++ b/src/wren_value.c @@ -672,57 +672,63 @@ Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, int index) return value; } -// Implementing Boyer-Moore-Horspool string matching algorithm. +// Uses the Boyer-Moore-Horspool string matching algorithm. uint32_t wrenStringFind(WrenVM* vm, ObjString* haystack, ObjString* needle) { - uint32_t length, last_index, index; - uint32_t shift[256]; // Full 8-bit alphabet - char last_char; - // We need to convert the 'char' value to 'unsigned char' in order not - // to get negative indexes. We use a union as an elegant alternative to - // explicit coercion (whose behaviour is not sure across compilers) - union { - char c; - unsigned char uc; - } value; + // Corner case, an empty needle is always found. + if (needle->length == 0) return 0; // If the needle is longer than the haystack it won't be found. - if (needle->length > haystack->length) - { - return haystack->length; - } + if (needle->length > haystack->length) return UINT32_MAX; - // Precalculate shifts. - last_index = needle->length - 1; + // Pre-calculate the shift table. For each character (8-bit value), we + // determine how far the search window can be advanced if that character is + // the last character in the haystack where we are searching for the needle + // and the needle doesn't match there. + uint32_t shift[UINT8_MAX]; + uint32_t needleEnd = needle->length - 1; - for (index = 0; index < 256; index++) + // By default, we assume the character is not the needle at all. In that case + // case, if a match fails on that character, we can advance one whole needle + // width since. + for (uint32_t index = 0; index < UINT8_MAX; index++) { shift[index] = needle->length; } - for (index = 0; index < last_index; index++) + + // Then, for every character in the needle, determine how far it is from the + // end. If a match fails on that character, we can advance the window such + // that it the last character in it lines up with the last place we could + // find it in the needle. + for (uint32_t index = 0; index < needleEnd; index++) { - value.c = needle->value[index]; - shift[value.uc] = last_index - index; + char c = needle->value[index]; + shift[(uint8_t)c] = needleEnd - index; } - // Search, left to right. - last_char = needle->value[last_index]; + // Slide the needle across the haystack, looking for the first match or + // stopping if the needle goes off the end. + char lastChar = needle->value[needleEnd]; + uint32_t range = haystack->length - needle->length; - length = haystack->length - needle->length; - - for (index = 0; index <= length; ) + for (uint32_t index = 0; index <= range; ) { - value.c = haystack->value[index + last_index]; - if ((last_char == value.c) && - memcmp(haystack->value + index, needle->value, last_index) == 0) + // Compare the last character in the haystack's window to the last character + // in the needle. If it matches, see if the whole needle matches. + char c = haystack->value[index + needleEnd]; + if (lastChar == c && + memcmp(haystack->value + index, needle->value, needleEnd) == 0) { + // Found a match. return index; } - index += shift[value.uc]; // Convert, same as above. + + // Otherwise, slide the needle forward. + index += shift[(uint8_t)c]; } // Not found. - return haystack->length; + return UINT32_MAX; } Upvalue* wrenNewUpvalue(WrenVM* vm, Value* value) diff --git a/src/wren_value.h b/src/wren_value.h index c99197b1..b50d1a0e 100644 --- a/src/wren_value.h +++ b/src/wren_value.h @@ -671,7 +671,7 @@ ObjString* wrenStringConcat(WrenVM* vm, const char* left, int leftLength, Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, int index); // Search for the first occurence of [needle] within [haystack] and returns its -// zero-based offset. Returns [haystack->length] if [haystack] does not contain +// zero-based offset. Returns `UINT32_MAX` if [haystack] does not contain // [needle]. uint32_t wrenStringFind(WrenVM* vm, ObjString* haystack, ObjString* needle); diff --git a/test/string/index_of.wren b/test/string/index_of.wren index 8ee67978..7c8ca35b 100644 --- a/test/string/index_of.wren +++ b/test/string/index_of.wren @@ -1,8 +1,16 @@ +IO.print("abcd".indexOf("")) // expect: 0 IO.print("abcd".indexOf("cd")) // expect: 2 IO.print("abcd".indexOf("a")) // expect: 0 +IO.print("abcd".indexOf("abcd")) // expect: 0 IO.print("abcd".indexOf("abcde")) // expect: -1 IO.print("abab".indexOf("ab")) // expect: 0 +// More complex cases. +IO.print("abcdefabcdefg".indexOf("defg")) // expect: 9 +IO.print("abcdabcdabcd".indexOf("dab")) // expect: 3 +IO.print("abcdabcdabcdabcd".indexOf("dabcdabc")) // expect: 3 +IO.print("abcdefg".indexOf("abcdef!")) // expect: -1 + // Non-ASCII. Note that it returns byte indices, not code points. IO.print("søméஃthîng".indexOf("e")) // expect: -1 IO.print("søméஃthîng".indexOf("m")) // expect: 3