New hashing routine.

This commit is contained in:
jjgarcia 2005-08-30 15:44:50 +00:00
parent b5211a4af7
commit 5bad4b0857
3 changed files with 136 additions and 82 deletions

View file

@ -8,6 +8,9 @@ ECL 0.9h
- Reworked the structure of the lexical environment to accelerate access to
variables.
- New hash routine, similar to SBCL's one, faster and leading to fewer
collisions between similar strings.
;;; Local Variables: ***
;;; mode:text ***
;;; fill-column:79 ***

View file

@ -20,109 +20,173 @@
#include <ctype.h>
#include "internal.h"
/*******************
* CRC-32 ROUTINES *
*******************/
/********************
* HASHING ROUTINES *
********************/
/*
* SBCL'S old mashing function. Leads to many collisions.
*/
#if FIXNUM_BITS > 32
#define mash(h,n) ((((h) << 5) | ((h) >> (FIXNUM_BITS - 5))) ^ (n))
#define hash_word(h,x) mash(h,(cl_index)x)
static cl_hashkey
hash_string(const char *buf, cl_index len)
hash_string(cl_hashkey h, const unsigned char *buf, cl_index len)
{
cl_hashkey h;
for (h = 0; len; len--) {
for (; len; len--) {
h = mash(h, (*buf++));
}
return h;
}
#else
/*
* SBCL's newest algorithm. Leads to few collisions, is fast, but
* limited to 32 bits.
*/
#define mix(a,b,c) \
{ \
a -= b; a -= c; a ^= (c>>13); \
b -= c; b -= a; b ^= (a<<8); \
c -= a; c -= b; c ^= (b>>13); \
a -= b; a -= c; a ^= (c>>12); \
b -= c; b -= a; b ^= (a<<16); \
c -= a; c -= b; c ^= (b>>5); \
a -= b; a -= c; a ^= (c>>3); \
b -= c; b -= a; b ^= (a<<10); \
c -= a; c -= b; c ^= (b>>15); \
}
static uint32_t
hash_string(uint32_t initval, const unsigned char *k, cl_index len)
{
uint32_t a = 0, b = 0, c = initval;
for (; len > 12; ) {
a += (k[0] +((uint32_t)k[1]<<8) +((uint32_t)k[2]<<16) +((uint32_t)k[3]<<24));
b += (k[4] +((uint32_t)k[5]<<8) +((uint32_t)k[6]<<16) +((uint32_t)k[7]<<24));
c += (k[8] +((uint32_t)k[9]<<8) +((uint32_t)k[10]<<16)+((uint32_t)k[11]<<24));
mix(a,b,c);
k += 12; len -= 12;
}
/*------------------------------------- handle the last 11 bytes */
c += len;
switch(len) {
/* all the case statements fall through */
case 11: c+=((uint32_t)k[10]<<24);
case 10: c+=((uint32_t)k[9]<<16);
case 9 : c+=((uint32_t)k[8]<<8);
/* the first byte of c is reserved for the length */
case 8 : b+=((uint32_t)k[7]<<24);
case 7 : b+=((uint32_t)k[6]<<16);
case 6 : b+=((uint32_t)k[5]<<8);
case 5 : b+=k[4];
case 4 : a+=((uint32_t)k[3]<<24);
case 3 : a+=((uint32_t)k[2]<<16);
case 2 : a+=((uint32_t)k[1]<<8);
case 1 : a+=k[0];
/* case 0: nothing left to add */
}
mix(a,b,c);
/*-------------------------------------------- report the result */
return c;
}
static uint32_t hash_word(uint32_t c, uint32_t a)
{
uint32_t b = 0;
mix(a, b, c);
return c;
}
#endif
static void corrupted_hash(cl_object hashtable) /*__attribute__((noreturn))*/;
static void
corrupted_hash(cl_object hashtable)
{
FEerror("internal error, corrupted hashtable ~S", 1, hashtable);
FEerror("internal error, corrupted hashtable ~S", 1, hashtable);
}
cl_hashkey
hash_eql(cl_object x)
static cl_hashkey
_hash_eql(cl_hashkey h, cl_object x)
{
BEGIN:
switch (type_of(x)) {
case t_bignum:
return hash_string((char*)x->big.big_limbs,
return hash_string(h, (unsigned char*)x->big.big_limbs,
labs(x->big.big_size) * sizeof(mp_limb_t));
break;
case t_ratio: {
cl_hashkey h = hash_eql(x->ratio.num);
return mash(h, hash_eql(x->ratio.den));
}
case t_ratio:
h = _hash_eql(h, x->ratio.num);
return _hash_eql(h, x->ratio.den);
case t_shortfloat:
return hash_string((char*)&sf(x), sizeof(sf(x)));
return hash_string(h, (unsigned char*)&sf(x), sizeof(sf(x)));
case t_longfloat:
return hash_string((char*)&lf(x), sizeof(lf(x)));
case t_complex: {
cl_hashkey h = hash_eql(x->complex.real);
return mash(h, hash_eql(x->complex.imag));
}
return hash_string(h, (unsigned char*)&lf(x), sizeof(lf(x)));
case t_complex:
h = _hash_eql(h, x->complex.real);
return _hash_eql(h, x->complex.imag);
case t_character:
return CHAR_CODE(x);
return hash_word(h, CHAR_CODE(x));
default:
return (cl_hashkey)x >> 2;
return hash_word(h, ((cl_hashkey)x >> 2));
}
}
static cl_hashkey
_hash_equal(int depth, cl_object x)
_hash_equal(int depth, cl_hashkey h, cl_object x)
{
switch (type_of(x)) {
case t_cons:
if (depth++ > 3) {
return 0;
} else {
cl_hashkey h = _hash_equal(depth, CAR(x));
return mash(h, _hash_equal(depth, CDR(x)));
}
h = _hash_equal(depth, h, CAR(x));
return _hash_equal(depth, h, CDR(x));
case t_symbol:
x = x->symbol.name;
case t_string:
return hash_string(x->string.self, x->string.fillp);
case t_pathname: {
cl_hashkey h = _hash_equal(depth, x->pathname.host);
h = mash(h, _hash_equal(depth, x->pathname.device));
h = mash(h, _hash_equal(depth, x->pathname.directory));
h = mash(h, _hash_equal(depth, x->pathname.name));
h = mash(h, _hash_equal(depth, x->pathname.type));
return mash(h, _hash_equal(depth, x->pathname.name));
}
return hash_string(h, x->string.self, x->string.fillp);
case t_pathname:
h = _hash_equal(depth, h, x->pathname.host);
h = _hash_equal(depth, h, x->pathname.device);
h = _hash_equal(depth, h, x->pathname.directory);
h = _hash_equal(depth, h, x->pathname.name);
h = _hash_equal(depth, h, x->pathname.type);
return _hash_equal(depth, h, x->pathname.name);
case t_random:
return x->random.value;
return hash_word(h, x->random.value);
case t_bitvector:
/* Notice that we may round out some bits. We must do this
* because the fill pointer may be set in the middle of a byte.
* If so, the extra bits _must_ _not_ take part in the hash,
* because otherwise we two bit arrays which are EQUAL might
* have different hash keys. */
return hash_string(x->vector.self.ch, x->vector.fillp / 8);
return hash_string(h, x->vector.self.ch, x->vector.fillp / 8);
default:
return hash_eql(x);
return _hash_eql(h, x);
}
}
static cl_hashkey
_hash_equalp(int depth, cl_object x)
_hash_equalp(int depth, cl_hashkey h, cl_object x)
{
cl_index i, len;
switch (type_of(x)) {
case t_character:
return toupper(CHAR_CODE(x));
return hash_word(h, toupper(CHAR_CODE(x)));
case t_cons:
if (depth++ > 3) {
return 0;
} else {
cl_hashkey h = _hash_equalp(depth, CAR(x));
return mash(h, _hash_equalp(depth, CDR(x)));
}
h = _hash_equalp(depth, h, CAR(x));
return _hash_equalp(depth, h, CDR(x));
case t_string:
case t_vector:
case t_bitvector:
@ -132,46 +196,36 @@ _hash_equalp(int depth, cl_object x)
len = x->vector.dim;
SCAN: if (depth++ >= 3) {
return 0;
} else {
cl_hashkey h = 0;
for (i = 0; i < len; i++) {
h = mash(h,_hash_equalp(depth, aref(x, i)));
}
return h;
}
break;
for (i = 0; i < len; i++) {
h = _hash_equalp(depth, h, aref(x, i));
}
return h;
case t_fixnum:
return fix(x);
return hash_word(h, fix(x));
case t_shortfloat:
/* FIXME! We should be more precise here! */
return (cl_index)sf(x);
return hash_word(h, (cl_index)sf(x));
case t_longfloat:
/* FIXME! We should be more precise here! */
return (cl_index)lf(x);
return hash_word(h, (cl_index)lf(x));
case t_bignum:
/* FIXME! We should be more precise here! */
case t_ratio: {
cl_hashkey h = _hash_equalp(depth, x->ratio.num);
return mash(h, _hash_equalp(depth, x->ratio.den));
}
case t_complex: {
cl_hashkey h = _hash_equalp(depth, x->complex.real);
return mash(h, _hash_equalp(depth, x->complex.imag));
}
case t_ratio:
h = _hash_equalp(depth, h, x->ratio.num);
return _hash_equalp(depth, h, x->ratio.den);
case t_complex:
h = _hash_equalp(depth, h, x->complex.real);
return _hash_equalp(depth, h, x->complex.imag);
case t_instance:
case t_hashtable:
return 42;
/* FIXME! We should be more precise here! */
return hash_word(h, 42);
default:
return _hash_equal(depth, x);
return _hash_equal(depth, h, x);
}
}
cl_hashkey
hash_equal(cl_object key)
{
return _hash_equal(0, key);
}
struct ecl_hashtable_entry *
ecl_search_hash(cl_object key, cl_object hashtable)
{
@ -187,10 +241,10 @@ ecl_search_hash(cl_object key, cl_object hashtable)
j = hsize;
switch (htest) {
case htt_eq: h = (cl_hashkey)key >> 2; break;
case htt_eql: h = hash_eql(key); break;
case htt_equal: h = _hash_equal(0, key); break;
case htt_equalp:h = _hash_equalp(0, key); break;
case htt_pack: h = _hash_equal(0, key);
case htt_eql: h = _hash_eql(0, key); break;
case htt_equal: h = _hash_equal(0, 0, key); break;
case htt_equalp:h = _hash_equalp(0, 0, key); break;
case htt_pack: h = _hash_equal(0, 0, key);
ho = MAKE_FIXNUM(h & 0xFFFFFFF);
break;
default: corrupted_hash(hashtable);
@ -267,10 +321,10 @@ add_new_to_hash(cl_object key, cl_object hashtable, cl_object value)
hsize = hashtable->hash.size;
switch (htest) {
case htt_eq: h = (cl_hashkey)key >> 2; break;
case htt_eql: h = hash_eql(key); break;
case htt_equal: h = _hash_equal(0, key); break;
case htt_equalp:h = _hash_equalp(0, key); break;
case htt_pack: h = _hash_equal(0, key); break;
case htt_eql: h = _hash_eql(0, key); break;
case htt_equal: h = _hash_equal(0, 0, key); break;
case htt_equalp:h = _hash_equalp(0, 0, key); break;
case htt_pack: h = _hash_equal(0, 0, key); break;
default: corrupted_hash(hashtable);
}
e = hashtable->hash.data;
@ -572,7 +626,7 @@ cl_hash_table_rehash_threshold(cl_object ht)
cl_object
cl_sxhash(cl_object key)
{
cl_index output = _hash_equal(0, key);
cl_index output = _hash_equal(0, 0, key);
const cl_index mask = ((cl_index)1 << (FIXNUM_BITS - 3)) - 1;
@(return MAKE_FIXNUM(output & mask))
}

View file

@ -636,9 +636,6 @@ extern cl_object cl_make_hash_table _ARGS((cl_narg narg, ...));
extern cl_object cl_gethash _ARGS((cl_narg narg, cl_object key, cl_object ht, ...));
extern cl_object si_copy_hash_table(cl_object orig);
extern cl_hashkey hash_eq(cl_object x);
extern cl_hashkey hash_eql(cl_object x);
extern cl_hashkey hash_equal(cl_object x);
extern void sethash(cl_object key, cl_object hashtable, cl_object value);
extern cl_object gethash(cl_object key, cl_object hash);
extern cl_object gethash_safe(cl_object key, cl_object hash, cl_object def);