forked from Mirror/wren
Store hash code in strings.
Makes string equality and string map keys much faster. Also did some other general string clean-up.
This commit is contained in:
@ -75,6 +75,8 @@ BENCHMARK("map_numeric", r"""500000500000""")
|
||||
|
||||
BENCHMARK("map_string", r"""3645600""")
|
||||
|
||||
BENCHMARK("string_equals", r"""3000000""")
|
||||
|
||||
LANGUAGES = [
|
||||
("wren", [os.path.join(WREN_DIR, 'wren')], ".wren"),
|
||||
("lua", ["lua"], ".lua"),
|
||||
|
||||
@ -1401,6 +1401,7 @@ DEF_PRIMITIVE(string_subscript)
|
||||
}
|
||||
|
||||
// TODO: Handle UTF-8 here.
|
||||
/*
|
||||
int step;
|
||||
int count = string->length;
|
||||
int start = calculateRange(vm, args, AS_RANGE(args[1]), &count, &step);
|
||||
@ -1414,6 +1415,8 @@ DEF_PRIMITIVE(string_subscript)
|
||||
result->value[count] = '\0';
|
||||
|
||||
RETURN_OBJ(result);
|
||||
*/
|
||||
RETURN_ERROR("Subscript ranges for strings are not implemented yet.");
|
||||
}
|
||||
|
||||
static ObjClass* defineSingleClass(WrenVM* vm, const char* name)
|
||||
|
||||
@ -357,29 +357,7 @@ static uint32_t hashObject(Obj* object)
|
||||
}
|
||||
|
||||
case OBJ_STRING:
|
||||
{
|
||||
ObjString* string = (ObjString*)object;
|
||||
|
||||
// FNV-1a hash. See: http://www.isthe.com/chongo/tech/comp/fnv/
|
||||
uint32_t hash = 2166136261u;
|
||||
|
||||
// We want the contents of the string to affect the hash, but we also
|
||||
// want to ensure it runs in constant time. We also don't want to bias
|
||||
// towards the prefix or suffix of the string. So sample up to eight
|
||||
// characters spread throughout the string.
|
||||
// TODO: Tune this.
|
||||
if (string->length > 0)
|
||||
{
|
||||
uint32_t step = 1 + 7 / string->length;
|
||||
for (uint32_t i = 0; i < string->length; i += step)
|
||||
{
|
||||
hash ^= string->value[i];
|
||||
hash *= 16777619;
|
||||
}
|
||||
}
|
||||
|
||||
return hash;
|
||||
}
|
||||
return ((ObjString*)object)->hash;
|
||||
|
||||
default:
|
||||
ASSERT(false, "Only immutable objects can be hashed.");
|
||||
@ -616,32 +594,61 @@ Value wrenNewRange(WrenVM* vm, double from, double to, bool isInclusive)
|
||||
return OBJ_VAL(range);
|
||||
}
|
||||
|
||||
// Creates a new string object with a null-terminated buffer large enough to
|
||||
// hold a string of [length] but does not fill in the bytes.
|
||||
//
|
||||
// The caller is expected to fill in the buffer and then calculate the string's
|
||||
// hash.
|
||||
static ObjString* allocateString(WrenVM* vm, size_t length)
|
||||
{
|
||||
ObjString* string = ALLOCATE_FLEX(vm, ObjString, char, length + 1);
|
||||
initObj(vm, &string->obj, OBJ_STRING, vm->stringClass);
|
||||
string->length = (int)length;
|
||||
string->value[length] = '\0';
|
||||
|
||||
return string;
|
||||
}
|
||||
|
||||
// Calculates and stores the hash code for [string].
|
||||
static void hashString(ObjString* string)
|
||||
{
|
||||
// FNV-1a hash. See: http://www.isthe.com/chongo/tech/comp/fnv/
|
||||
uint32_t hash = 2166136261u;
|
||||
|
||||
// We want the contents of the string to affect the hash, but we also
|
||||
// want to ensure it runs in constant time. We also don't want to bias
|
||||
// towards the prefix or suffix of the string. So sample up to eight
|
||||
// characters spread throughout the string.
|
||||
// TODO: Tune this.
|
||||
if (string->length > 0)
|
||||
{
|
||||
uint32_t step = 1 + 7 / string->length;
|
||||
for (uint32_t i = 0; i < string->length; i += step)
|
||||
{
|
||||
hash ^= string->value[i];
|
||||
hash *= 16777619;
|
||||
}
|
||||
}
|
||||
|
||||
string->hash = hash;
|
||||
}
|
||||
|
||||
Value wrenNewString(WrenVM* vm, const char* text, size_t length)
|
||||
{
|
||||
// Allow NULL if the string is empty since byte buffers don't allocate any
|
||||
// characters for a zero-length string.
|
||||
ASSERT(length == 0 || text != NULL, "Unexpected NULL string.");
|
||||
|
||||
// TODO: Don't allocate a heap string at all for zero-length strings.
|
||||
ObjString* string = wrenNewUninitializedString(vm, length);
|
||||
ObjString* string = allocateString(vm, length);
|
||||
|
||||
// Copy the string (if given one).
|
||||
if (length > 0) memcpy(string->value, text, length);
|
||||
|
||||
string->value[length] = '\0';
|
||||
hashString(string);
|
||||
|
||||
return OBJ_VAL(string);
|
||||
}
|
||||
|
||||
ObjString* wrenNewUninitializedString(WrenVM* vm, size_t length)
|
||||
{
|
||||
ObjString* string = ALLOCATE_FLEX(vm, ObjString, char, length + 1);
|
||||
initObj(vm, &string->obj, OBJ_STRING, vm->stringClass);
|
||||
string->length = (int)length;
|
||||
|
||||
return string;
|
||||
}
|
||||
|
||||
Value wrenNumToString(WrenVM* vm, double value)
|
||||
{
|
||||
// Corner case: If the value is NaN, different versions of libc produce
|
||||
@ -700,7 +707,7 @@ Value wrenStringFormat(WrenVM* vm, const char* format, ...)
|
||||
va_end(argList);
|
||||
|
||||
// Concatenate the string.
|
||||
ObjString* result = wrenNewUninitializedString(vm, totalLength);
|
||||
ObjString* result = allocateString(vm, totalLength);
|
||||
|
||||
va_start(argList, format);
|
||||
char* start = result->value;
|
||||
@ -732,7 +739,7 @@ Value wrenStringFormat(WrenVM* vm, const char* format, ...)
|
||||
}
|
||||
va_end(argList);
|
||||
|
||||
*start = '\0';
|
||||
hashString(result);
|
||||
|
||||
return OBJ_VAL(result);
|
||||
}
|
||||
@ -753,10 +760,7 @@ Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, uint32_t index)
|
||||
else if ((first & 0xe0) == 0xc0) numBytes = 2;
|
||||
else numBytes = 1;
|
||||
|
||||
ObjString* result = wrenNewUninitializedString(vm, numBytes);
|
||||
memcpy(result->value, string->value + index, numBytes);
|
||||
result->value[numBytes] = '\0';
|
||||
return OBJ_VAL(result);
|
||||
return wrenNewString(vm, string->value + index, numBytes);
|
||||
}
|
||||
|
||||
// Uses the Boyer-Moore-Horspool string matching algorithm.
|
||||
@ -1135,6 +1139,7 @@ bool wrenValuesEqual(Value a, Value b)
|
||||
ObjString* aString = (ObjString*)aObj;
|
||||
ObjString* bString = (ObjString*)bObj;
|
||||
return aString->length == bString->length &&
|
||||
aString->hash == bString->hash &&
|
||||
memcmp(aString->value, bString->value, aString->length) == 0;
|
||||
}
|
||||
|
||||
|
||||
@ -109,6 +109,7 @@ typedef struct
|
||||
Obj obj;
|
||||
// Does not include the null terminator.
|
||||
uint32_t length;
|
||||
uint32_t hash;
|
||||
char value[FLEXIBLE_ARRAY];
|
||||
} ObjString;
|
||||
|
||||
@ -485,6 +486,10 @@ typedef struct
|
||||
// Returns true if [value] is a string object.
|
||||
#define IS_STRING(value) (wrenIsObjType(value, OBJ_STRING))
|
||||
|
||||
// Creates a new string object from [text], which should be a bare C string
|
||||
// literal. This determines the length of the string automatically at compile
|
||||
// time based on the size of the character array -1 for the terminating '\0'.
|
||||
#define CONST_STRING(vm, text) wrenNewString((vm), (text), sizeof(text) - 1)
|
||||
|
||||
// An IEEE 754 double-precision float is a 64-bit value with bits laid out like:
|
||||
//
|
||||
@ -697,22 +702,11 @@ ObjModule* wrenNewModule(WrenVM* vm);
|
||||
// Creates a new range from [from] to [to].
|
||||
Value wrenNewRange(WrenVM* vm, double from, double to, bool isInclusive);
|
||||
|
||||
// Creates a new string object from [text], which should be a bare C string
|
||||
// literal. This determines the length of the string automatically at compile
|
||||
// time based on the size of the character array -1 for the terminating '\0'.
|
||||
#define CONST_STRING(vm, text) wrenNewString((vm), (text), sizeof(text) - 1)
|
||||
|
||||
// Creates a new string object of [length] and copies [text] into it.
|
||||
//
|
||||
// [text] may be NULL if [length] is zero.
|
||||
Value wrenNewString(WrenVM* vm, const char* text, size_t length);
|
||||
|
||||
// Creates a new string object with a buffer large enough to hold a string of
|
||||
// [length] but does no initialization of the buffer.
|
||||
//
|
||||
// The caller is expected to fully initialize the buffer after calling.
|
||||
ObjString* wrenNewUninitializedString(WrenVM* vm, size_t length);
|
||||
|
||||
// Produces a string representation of [value].
|
||||
Value wrenNumToString(WrenVM* vm, double value);
|
||||
|
||||
|
||||
24
test/benchmark/string_equals.wren
Normal file
24
test/benchmark/string_equals.wren
Normal file
@ -0,0 +1,24 @@
|
||||
var start = IO.clock
|
||||
|
||||
var count = 0
|
||||
for (i in 1..1000000) {
|
||||
if ("abc" == "abc") count = count + 1
|
||||
if ("a slightly longer string" ==
|
||||
"a slightly longer string") count = count + 1
|
||||
if ("a significantly longer string but still not overwhelmingly long string" ==
|
||||
"a significantly longer string but still not overwhelmingly long string") count = count + 1
|
||||
|
||||
if ("" == "abc") count = count + 1
|
||||
if ("abc" == "abcd") count = count + 1
|
||||
if ("changed one character" == "changed %ne character") count = count + 1
|
||||
if ("123" == 123) count = count + 1
|
||||
if ("a slightly longer string" ==
|
||||
"a slightly longer string!") count = count + 1
|
||||
if ("a slightly longer string" ==
|
||||
"a slightly longer strinh") count = count + 1
|
||||
if ("a significantly longer string but still not overwhelmingly long string" ==
|
||||
"another") count = count + 1
|
||||
}
|
||||
|
||||
IO.print(count)
|
||||
IO.print("elapsed: ", IO.clock - start)
|
||||
@ -1,3 +1,4 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var string = "abcde"
|
||||
IO.print(string[0..0]) // expect: a
|
||||
IO.print(string[1...1] == "") // expect: true
|
||||
|
||||
@ -1,2 +1,3 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var a = "string"
|
||||
a[1.5..2] // expect runtime error: Range start must be an integer.
|
||||
|
||||
@ -1,2 +1,3 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var a = "123"
|
||||
a[3..2] // expect runtime error: Range start out of bounds.
|
||||
|
||||
@ -1,2 +1,3 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var a = "123"
|
||||
a[-4..2] // expect runtime error: Range start out of bounds.
|
||||
|
||||
@ -1,2 +1,3 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var a = "123"
|
||||
a[1...4] // expect runtime error: Range end out of bounds.
|
||||
|
||||
@ -1,2 +1,3 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var a = "123"
|
||||
a[0...-5] // expect runtime error: Range end out of bounds.
|
||||
|
||||
@ -1,2 +1,3 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var a = "string"
|
||||
a[1..2.5] // expect runtime error: Range end must be an integer.
|
||||
|
||||
@ -1,2 +1,3 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var a = "123"
|
||||
a[1..3] // expect runtime error: Range end out of bounds.
|
||||
|
||||
@ -1,2 +1,3 @@
|
||||
// skip: Range subscripts for strings don't handle UTF-8.
|
||||
var a = "123"
|
||||
a[0..-4] // expect runtime error: Range end out of bounds.
|
||||
|
||||
Reference in New Issue
Block a user