1
0
forked from Mirror/wren

Store hash code in strings.

Makes string equality and string map keys much faster.
Also did some other general string clean-up.
This commit is contained in:
Bob Nystrom
2015-03-18 07:09:03 -07:00
parent b80ba29b0e
commit be11d09bd8
14 changed files with 89 additions and 52 deletions

View File

@ -75,6 +75,8 @@ BENCHMARK("map_numeric", r"""500000500000""")
BENCHMARK("map_string", r"""3645600""")
BENCHMARK("string_equals", r"""3000000""")
LANGUAGES = [
("wren", [os.path.join(WREN_DIR, 'wren')], ".wren"),
("lua", ["lua"], ".lua"),

View File

@ -1401,6 +1401,7 @@ DEF_PRIMITIVE(string_subscript)
}
// TODO: Handle UTF-8 here.
/*
int step;
int count = string->length;
int start = calculateRange(vm, args, AS_RANGE(args[1]), &count, &step);
@ -1414,6 +1415,8 @@ DEF_PRIMITIVE(string_subscript)
result->value[count] = '\0';
RETURN_OBJ(result);
*/
RETURN_ERROR("Subscript ranges for strings are not implemented yet.");
}
static ObjClass* defineSingleClass(WrenVM* vm, const char* name)

View File

@ -357,29 +357,7 @@ static uint32_t hashObject(Obj* object)
}
case OBJ_STRING:
{
ObjString* string = (ObjString*)object;
// FNV-1a hash. See: http://www.isthe.com/chongo/tech/comp/fnv/
uint32_t hash = 2166136261u;
// We want the contents of the string to affect the hash, but we also
// want to ensure it runs in constant time. We also don't want to bias
// towards the prefix or suffix of the string. So sample up to eight
// characters spread throughout the string.
// TODO: Tune this.
if (string->length > 0)
{
uint32_t step = 1 + 7 / string->length;
for (uint32_t i = 0; i < string->length; i += step)
{
hash ^= string->value[i];
hash *= 16777619;
}
}
return hash;
}
return ((ObjString*)object)->hash;
default:
ASSERT(false, "Only immutable objects can be hashed.");
@ -616,32 +594,61 @@ Value wrenNewRange(WrenVM* vm, double from, double to, bool isInclusive)
return OBJ_VAL(range);
}
// Allocates a string object whose character buffer has room for [length]
// bytes plus a terminating '\0'. The terminator is written here; the other
// [length] bytes are left for the caller to fill in.
//
// This function does not assign the string's hash field. The caller is
// expected to fill in the buffer and then calculate the string's hash.
static ObjString* allocateString(WrenVM* vm, size_t length)
{
  // One extra char beyond [length] for the terminator.
  ObjString* string = ALLOCATE_FLEX(vm, ObjString, char, length + 1);
  initObj(vm, &string->obj, OBJ_STRING, vm->stringClass);

  // The length field is a uint32_t, so convert straight to that type instead
  // of detouring through a signed int.
  string->length = (uint32_t)length;
  string->value[length] = '\0';

  return string;
}
// Calculates the hash code for [string] from its current contents and stores
// it in the string's hash field.
//
// The hash is computed from whatever bytes are in the buffer at the time of
// the call, so call this only after the buffer has been filled in.
static void hashString(ObjString* string)
{
  // FNV-1a hash. See: http://www.isthe.com/chongo/tech/comp/fnv/
  uint32_t hash = 2166136261u;

  // We want the contents of the string to affect the hash, but we also want
  // hashing to take constant time and not to look only at the prefix or only
  // at the suffix. So sample at most eight bytes at evenly spaced offsets.
  //
  // The stride is ceil(length / 8): strings of up to eight bytes are hashed in
  // full, and longer strings are sampled every [step] bytes starting at 0.
  // TODO: Tune this.
  uint32_t length = string->length;
  if (length > 0)
  {
    uint32_t step = 1 + (length - 1) / 8;
    uint32_t i = 0;
    for (;;)
    {
      // FNV-1a is defined over octets. Read the byte as unsigned so that a
      // signed plain char is not sign-extended into the upper hash bits.
      hash ^= (uint8_t)string->value[i];
      hash *= 16777619u;

      // Decide whether to continue from the distance that is left instead of
      // from i + step, so the index cannot wrap for lengths near UINT32_MAX.
      if (length - i <= step) break;
      i += step;
    }
  }

  string->hash = hash;
}
// Creates a new string object holding a copy of the first [length] bytes of
// [text], stores its hash via hashString(), and returns it wrapped with
// OBJ_VAL.
//
// [text] may be NULL only when [length] is zero.
//
// NOTE(review): as listed here the body declares `string` twice in a row, once
// through wrenNewUninitializedString and once through allocateString. That
// looks like the before and after lines of a change shown together -- confirm
// which single declaration is meant to remain.
Value wrenNewString(WrenVM* vm, const char* text, size_t length)
{
// Allow NULL if the string is empty since byte buffers don't allocate any
// characters for a zero-length string.
ASSERT(length == 0 || text != NULL, "Unexpected NULL string.");
// TODO: Don't allocate a heap string at all for zero-length strings.
ObjString* string = wrenNewUninitializedString(vm, length);
ObjString* string = allocateString(vm, length);
// Copy the string (if given one).
if (length > 0) memcpy(string->value, text, length);
// Terminate the buffer just past the copied bytes.
string->value[length] = '\0';
// The contents are final now, so compute and store the hash.
hashString(string);
return OBJ_VAL(string);
}
// Creates a new string object with a buffer large enough to hold [length]
// characters plus a terminator, without writing anything into the buffer.
//
// The caller is expected to fully initialize the buffer, including the
// terminating '\0', after calling.
ObjString* wrenNewUninitializedString(WrenVM* vm, size_t length)
{
  // One extra char beyond [length] so the caller has room for the terminator.
  ObjString* string = ALLOCATE_FLEX(vm, ObjString, char, length + 1);
  initObj(vm, &string->obj, OBJ_STRING, vm->stringClass);

  // The length field is a uint32_t, so convert straight to that type instead
  // of detouring through a signed int.
  string->length = (uint32_t)length;

  return string;
}
Value wrenNumToString(WrenVM* vm, double value)
{
// Corner case: If the value is NaN, different versions of libc produce
@ -700,7 +707,7 @@ Value wrenStringFormat(WrenVM* vm, const char* format, ...)
va_end(argList);
// Concatenate the string.
ObjString* result = wrenNewUninitializedString(vm, totalLength);
ObjString* result = allocateString(vm, totalLength);
va_start(argList, format);
char* start = result->value;
@ -732,7 +739,7 @@ Value wrenStringFormat(WrenVM* vm, const char* format, ...)
}
va_end(argList);
*start = '\0';
hashString(result);
return OBJ_VAL(result);
}
@ -753,10 +760,7 @@ Value wrenStringCodePointAt(WrenVM* vm, ObjString* string, uint32_t index)
else if ((first & 0xe0) == 0xc0) numBytes = 2;
else numBytes = 1;
ObjString* result = wrenNewUninitializedString(vm, numBytes);
memcpy(result->value, string->value + index, numBytes);
result->value[numBytes] = '\0';
return OBJ_VAL(result);
return wrenNewString(vm, string->value + index, numBytes);
}
// Uses the Boyer-Moore-Horspool string matching algorithm.
@ -1135,6 +1139,7 @@ bool wrenValuesEqual(Value a, Value b)
ObjString* aString = (ObjString*)aObj;
ObjString* bString = (ObjString*)bObj;
return aString->length == bString->length &&
aString->hash == bString->hash &&
memcmp(aString->value, bString->value, aString->length) == 0;
}

View File

@ -109,6 +109,7 @@ typedef struct
Obj obj;
// Does not include the null terminator.
uint32_t length;
uint32_t hash;
char value[FLEXIBLE_ARRAY];
} ObjString;
@ -485,6 +486,10 @@ typedef struct
// Returns true if [value] is a string object.
#define IS_STRING(value) (wrenIsObjType(value, OBJ_STRING))
// Creates a new string object from [text], which must be a bare C string
// literal (a character array, not a `char*`). The length is computed at
// compile time as sizeof(text) - 1: the size of the array minus one for the
// terminating '\0'. Given a pointer instead, sizeof would measure the pointer
// and the length would be wrong.
#define CONST_STRING(vm, text) wrenNewString((vm), (text), sizeof(text) - 1)
// An IEEE 754 double-precision float is a 64-bit value with bits laid out like:
//
@ -697,22 +702,11 @@ ObjModule* wrenNewModule(WrenVM* vm);
// Creates a new range from [from] to [to].
Value wrenNewRange(WrenVM* vm, double from, double to, bool isInclusive);
// Creates a new string object from [text], which must be a bare C string
// literal (a character array, not a `char*`). The length is computed at
// compile time as sizeof(text) - 1: the size of the array minus one for the
// terminating '\0'. Given a pointer instead, sizeof would measure the pointer
// and the length would be wrong.
#define CONST_STRING(vm, text) wrenNewString((vm), (text), sizeof(text) - 1)
// Creates a new string object of [length] and copies [text] into it.
//
// [text] may be NULL if [length] is zero.
Value wrenNewString(WrenVM* vm, const char* text, size_t length);
// Creates a new string object with a buffer large enough to hold a string of
// [length] but does no initialization of the buffer.
//
// The caller is expected to fully initialize the buffer after calling.
ObjString* wrenNewUninitializedString(WrenVM* vm, size_t length);
// Produces a string representation of [value].
Value wrenNumToString(WrenVM* vm, double value);

View File

@ -0,0 +1,24 @@
// Benchmark for string equality. Each pass of the loop evaluates ten `==`
// comparisons and bumps [count] for every one that is true, then the script
// prints the total and the elapsed time.
var start = IO.clock
var count = 0
for (i in 1..1000000) {
// Identical literals of increasing length.
if ("abc" == "abc") count = count + 1
if ("a slightly longer string" ==
"a slightly longer string") count = count + 1
if ("a significantly longer string but still not overwhelmingly long string" ==
"a significantly longer string but still not overwhelmingly long string") count = count + 1
// Different lengths, one side a prefix of the other.
if ("" == "abc") count = count + 1
if ("abc" == "abcd") count = count + 1
// Same length, one character in the middle differs.
if ("changed one character" == "changed %ne character") count = count + 1
// A string compared against a number.
if ("123" == 123) count = count + 1
// Right-hand side has one extra trailing character.
if ("a slightly longer string" ==
"a slightly longer string!") count = count + 1
// Same length, only the last character differs.
if ("a slightly longer string" ==
"a slightly longer strinh") count = count + 1
// Very different lengths and contents.
if ("a significantly longer string but still not overwhelmingly long string" ==
"another") count = count + 1
}
IO.print(count)
IO.print("elapsed: ", IO.clock - start)

View File

@ -1,3 +1,4 @@
// skip: Range subscripts for strings don't handle UTF-8.
var string = "abcde"
IO.print(string[0..0]) // expect: a
IO.print(string[1...1] == "") // expect: true

View File

@ -1,2 +1,3 @@
// skip: Range subscripts for strings don't handle UTF-8.
var a = "string"
a[1.5..2] // expect runtime error: Range start must be an integer.

View File

@ -1,2 +1,3 @@
// skip: Range subscripts for strings don't handle UTF-8.
var a = "123"
a[3..2] // expect runtime error: Range start out of bounds.

View File

@ -1,2 +1,3 @@
// skip: Range subscripts for strings don't handle UTF-8.
var a = "123"
a[-4..2] // expect runtime error: Range start out of bounds.

View File

@ -1,2 +1,3 @@
// skip: Range subscripts for strings don't handle UTF-8.
var a = "123"
a[1...4] // expect runtime error: Range end out of bounds.

View File

@ -1,2 +1,3 @@
// skip: Range subscripts for strings don't handle UTF-8.
var a = "123"
a[0...-5] // expect runtime error: Range end out of bounds.

View File

@ -1,2 +1,3 @@
// skip: Range subscripts for strings don't handle UTF-8.
var a = "string"
a[1..2.5] // expect runtime error: Range end must be an integer.

View File

@ -1,2 +1,3 @@
// skip: Range subscripts for strings don't handle UTF-8.
var a = "123"
a[1..3] // expect runtime error: Range end out of bounds.

View File

@ -1,2 +1,3 @@
// skip: Range subscripts for strings don't handle UTF-8.
var a = "123"
a[0..-4] // expect runtime error: Range end out of bounds.