From 16faca09b62496a1f4ac3900371a06edda979c2e Mon Sep 17 00:00:00 2001 From: Marius Gerbershagen Date: Sun, 6 Feb 2022 20:15:36 +0100 Subject: [PATCH] hash-tables: performance improvements Previously, we used the "tombstones" approach for removing entries from a hash-table, that is buckets were marked as previously occupied but currently empty. If the hash-table did not grow in size these tombstones were never cleaned up, only reused for new entries. For a general purpose hash-table this is a problematic strategy because there are workloads where this significantly slows down any attempted lookup or deletion for elements that are not contained in the hash-table. This happens when new elements are added and deleted regularly for a long time without the size of the hash-table changing. Even if the hash-table is never filled above a small fraction of its size, tombstones will eventually accumulate which an unsuccessfull search for an element will have to skip over. In the new implementation, we don't use any tombstones at all but instead shift elements into more optimal buckets. In the case where the hash-table in the old approach was free of tombstones, the new approach is a little slower for the process of deleting from highly occupied hash-tables itself but inserting into a hash-table is a little bit faster on the other hand. --- src/c/hash.d | 323 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 197 insertions(+), 126 deletions(-) diff --git a/src/c/hash.d b/src/c/hash.d index f580b72f5..e231576fd 100644 --- a/src/c/hash.d +++ b/src/c/hash.d @@ -276,28 +276,15 @@ static cl_hashkey _hash_generic(cl_object ht, cl_object key) { } #define HASH_TABLE_LOOP(hkey,hvalue,h,HASH_TABLE_LOOP_TEST) { \ - cl_index hsize = hashtable->hash.size; \ - cl_index i = h % hsize, j = hsize, k; \ - for (k = 0; k < hsize; i = (i + 1) % hsize, k++) { \ + cl_index i, hsize = hashtable->hash.size; \ + /* INV: there is at least one empty bucket so that this loop will + * terminate */ \ + for (i = h % hsize; ; i = (i + 1) % hsize) { \ struct ecl_hashtable_entry *e = hashtable->hash.data + i; \ cl_object hkey = e->key, hvalue = e->value; \ - if (hkey == OBJNULL) { \ - if (hvalue == OBJNULL) { \ - if (j == hsize) \ - return e; \ - else \ - return hashtable->hash.data + j; \ - } else { \ - if (j == hsize) \ - j = i; \ - else if (j == i) \ - return e; \ - } \ - continue; \ - } \ - if (HASH_TABLE_LOOP_TEST) return hashtable->hash.data + i; \ + (void)hvalue; /* silence unused-variable compiler warning */ \ + if (hkey == OBJNULL || (HASH_TABLE_LOOP_TEST)) return e; \ } \ - return hashtable->hash.data + j; \ } #define HASH_TABLE_SET(h,loop,compute_key,store_key) { \ @@ -318,6 +305,52 @@ AGAIN: \ return hashtable; \ } +/* HASH_TABLE_REMOVE tries to fills up holes generated by deleting + * entries from a hashtable as follows. Iterate through all entries f + * to the right of the deleted entry e (the hole). If the distance + * between f's current and its optimal location is greater than the + * distance between e and f, then we can put f into the hole. Repeat + * with the new hole at the location of f until the holes are all + * filled. */ + +#define HASH_TABLE_REMOVE(hkey,hvalue,h,HASH_TABLE_LOOP_TEST,compute_key) { \ + cl_index i, hsize = hashtable->hash.size; \ + /* INV: there is at least one empty bucket so that this loop will + * terminate */ \ + for (i = h % hsize; ; i = (i + 1) % hsize) { \ + struct ecl_hashtable_entry *e = hashtable->hash.data + i; \ + cl_object hkey = e->key, hvalue = e->value; \ + (void)hvalue; /* silence unused-variable compiler warning */ \ + if (hkey == OBJNULL) return 0; \ + if (HASH_TABLE_LOOP_TEST) { \ + cl_index j = (i+1) % hsize, k; \ + for (k = 1; k <= hsize; j = (j+1) % hsize, k++) { \ + struct ecl_hashtable_entry *f = hashtable->hash.data + j; \ + hkey = f->key; \ + hvalue = f->value; \ + if (hkey == OBJNULL) { \ + e->key = OBJNULL; \ + e->value = OBJNULL; \ + break; \ + } \ + cl_hashkey hf = compute_key; \ + cl_index m = hf % hsize; \ + /* d: distance of f from the optimal position */ \ + cl_index d = (j >= m) ? (j - m) : (j + hsize - m); \ + if (k <= d) { \ + e->key = hkey; \ + e->value = hvalue; \ + e = f; \ + i = j; \ + k = 0; \ + } \ + } \ + hashtable->hash.entries--; \ + return 1; \ + } \ + } \ +} + /* * EQ HASHTABLES */ @@ -338,27 +371,18 @@ _ecl_gethash_eq(cl_object key, cl_object hashtable, cl_object def) return (e->key == OBJNULL)? def : e->value; } -static bool -_ecl_remhash_eq(cl_object key, cl_object hashtable) -{ - cl_hashkey h = _hash_eq(key); - struct ecl_hashtable_entry *e = _ecl_hash_loop_eq(h, key, hashtable); - if (e->key == OBJNULL) { - return 0; - } else { - e->key = OBJNULL; - e->value = ECL_NIL; - hashtable->hash.entries--; - return 1; - } -} - static cl_object _ecl_sethash_eq(cl_object key, cl_object hashtable, cl_object value) { HASH_TABLE_SET(h, _ecl_hash_loop_eq, _hash_eq(key), key); } +static bool +_ecl_remhash_eq(cl_object key, cl_object hashtable) +{ + HASH_TABLE_REMOVE(hkey, hvalue, _hash_eq(key), key == hkey, _hash_eq(hkey)); +} + /* * EQL HASHTABLES */ @@ -386,16 +410,7 @@ _ecl_sethash_eql(cl_object key, cl_object hashtable, cl_object value) static bool _ecl_remhash_eql(cl_object key, cl_object hashtable) { - cl_hashkey h = _hash_eql(0, key); - struct ecl_hashtable_entry *e = _ecl_hash_loop_eql(h, key, hashtable); - if (e->key == OBJNULL) { - return 0; - } else { - e->key = OBJNULL; - e->value = ECL_NIL; - hashtable->hash.entries--; - return 1; - } + HASH_TABLE_REMOVE(hkey, hvalue, _hash_eql(0, key), ecl_eql(key, hkey), _hash_eql(0, hkey)); } /* @@ -425,16 +440,7 @@ _ecl_sethash_equal(cl_object key, cl_object hashtable, cl_object value) static bool _ecl_remhash_equal(cl_object key, cl_object hashtable) { - cl_hashkey h = _hash_equal(3, 0, key); - struct ecl_hashtable_entry *e = _ecl_hash_loop_equal(h, key, hashtable); - if (e->key == OBJNULL) { - return 0; - } else { - e->key = OBJNULL; - e->value = ECL_NIL; - hashtable->hash.entries--; - return 1; - } + HASH_TABLE_REMOVE(hkey, hvalue, _hash_equal(3, 0, key), ecl_equal(key, hkey), _hash_equal(3, 0, hkey)); } /* @@ -464,16 +470,7 @@ _ecl_sethash_equalp(cl_object key, cl_object hashtable, cl_object value) static bool _ecl_remhash_equalp(cl_object key, cl_object hashtable) { - cl_hashkey h = _hash_equalp(3, 0, key); - struct ecl_hashtable_entry *e = _ecl_hash_loop_equalp(h, key, hashtable); - if (e->key == OBJNULL) { - return 0; - } else { - e->key = OBJNULL; - e->value = ECL_NIL; - hashtable->hash.entries--; - return 1; - } + HASH_TABLE_REMOVE(hkey, hvalue, _hash_equalp(3, 0, key), ecl_equalp(key, hkey), _hash_equalp(3, 0, hkey)); } /* @@ -505,15 +502,8 @@ static bool _ecl_remhash_pack(cl_object key, cl_object hashtable) { cl_hashkey h = _hash_equal(3, 0, key); - struct ecl_hashtable_entry *e = _ecl_hash_loop_pack(h, key, hashtable); - if (e->key == OBJNULL) { - return 0; - } else { - e->key = OBJNULL; - e->value = ECL_NIL; - hashtable->hash.entries--; - return 1; - } + cl_object ho = ecl_make_fixnum(h & 0xFFFFFFF); + HASH_TABLE_REMOVE(hkey, hvalue, h, (ho==hkey) && ecl_string_eq(key,SYMBOL_NAME(hvalue)), _hash_equal(3, 0, SYMBOL_NAME(hvalue))); } /* @@ -549,16 +539,8 @@ _ecl_sethash_generic(cl_object key, cl_object hashtable, cl_object value) static bool _ecl_remhash_generic(cl_object key, cl_object hashtable) { - cl_hashkey h = _hash_generic(hashtable, key); - struct ecl_hashtable_entry *e = _ecl_hash_loop_generic(h, key, hashtable); - if (e->key == OBJNULL) { - return 0; - } else { - e->key = OBJNULL; - e->value = ECL_NIL; - hashtable->hash.entries--; - return 1; - } + cl_object test_fun = hashtable->hash.generic_test; + HASH_TABLE_REMOVE(hkey, hvalue, _hash_generic(hashtable, key), _ecl_generic_hash_test(test_fun, key, hkey), _hash_generic(hashtable, hkey)); } /* @@ -658,26 +640,12 @@ static struct ecl_hashtable_entry * _ecl_weak_hash_loop(cl_hashkey h, cl_object key, cl_object hashtable, struct ecl_hashtable_entry *aux) { - cl_index hsize = hashtable->hash.size; - cl_index i = h % hsize, j = hsize, k; - for (k = 0; k < hsize; i = (i + 1) % hsize, k++) { + cl_index i, hsize = hashtable->hash.size; + for (i = h % hsize; ; i = (i + 1) % hsize) { struct ecl_hashtable_entry *p = hashtable->hash.data + i; struct ecl_hashtable_entry e = *aux = copy_entry(p, hashtable); if (e.key == OBJNULL) { - if (e.value == OBJNULL) { - if (j == hsize) { - return p; - } else { - return hashtable->hash.data + j; - } - } else { - if (j == hsize) { - j = i; - } else if (j == i) { - return p; - } - } - continue; + return p; } switch (hashtable->hash.test) { case ecl_htt_eq: @@ -704,7 +672,6 @@ _ecl_weak_hash_loop(cl_hashkey h, cl_object key, cl_object hashtable, ecl_internal_error("Unknown hash test."); } } - return hashtable->hash.data + j; } static cl_object @@ -759,17 +726,75 @@ _ecl_sethash_weak(cl_object key, cl_object hashtable, cl_object value) static bool _ecl_remhash_weak(cl_object key, cl_object hashtable) { + cl_index i, hsize = hashtable->hash.size; cl_hashkey h = _ecl_hash_key(hashtable, key); - struct ecl_hashtable_entry aux[1]; - struct ecl_hashtable_entry *e = - _ecl_weak_hash_loop(h, key, hashtable, aux); - if (aux->key != OBJNULL) { - hashtable->hash.entries--; - e->key = OBJNULL; - e->value = ECL_NIL; - return 1; - } else { - return 0; + for (i = h % hsize; ; i = (i + 1) % hsize) { + struct ecl_hashtable_entry *p = hashtable->hash.data + i; + struct ecl_hashtable_entry e = copy_entry(p, hashtable); + if (e.key == OBJNULL) { + return 0; + } + bool found = FALSE; + switch (hashtable->hash.test) { + case ecl_htt_eq: + if (e.key == key) + found = TRUE; + break; + case ecl_htt_eql: + if (ecl_eql(e.key, key)) + found = TRUE; + break; + case ecl_htt_equal: + if (ecl_equal(e.key, key)) + found = TRUE; + break; + case ecl_htt_equalp: + if (ecl_equalp(e.key, key)) + found = TRUE; + break; + case ecl_htt_generic: + if (_ecl_generic_hash_test(hashtable->hash.generic_test, e.key, key)) + found = TRUE; + break; + default: + ecl_internal_error("Unknown hash test."); + } + if (found) { + cl_index j = (i+1) % hsize, k; + for (k = 1; k <= hsize; j = (j+1) % hsize, k++) { + struct ecl_hashtable_entry *q = hashtable->hash.data + j; + struct ecl_hashtable_entry f = copy_entry(q, hashtable); + if (f.key == OBJNULL) { + break; + } + cl_hashkey hf = _ecl_hash_key(hashtable, f.value); + cl_index m = hf % hsize; + cl_index d = (j >= m) ? (j - m) : (j + hsize - m); + if (k <= d) { + switch (hashtable->hash.weak) { + case ecl_htt_weak_key: + case ecl_htt_weak_key_and_value: + case ecl_htt_weak_key_or_value: + p->key = si_make_weak_pointer(f.key); + default: + p->key = f.key; + } + switch (hashtable->hash.weak) { + case ecl_htt_weak_value: + case ecl_htt_weak_key_and_value: + case ecl_htt_weak_key_or_value: + p->value = si_make_weak_pointer(f.value); + default: + p->value = f.value; + } + p = q; + i = j; + k = 0; + } + } + hashtable->hash.entries--; + return 1; + } } } #endif @@ -889,6 +914,9 @@ ecl_extend_hashtable(cl_object hashtable) new->hash.entries = 0; new->hash.size = new_size; new->hash.limit = new->hash.size * new->hash.factor; + if (new->hash.limit >= new_size) { + new->hash.limit = new_size - 1; + } new->hash.data = (struct ecl_hashtable_entry *) ecl_alloc(new_size * sizeof(struct ecl_hashtable_entry)); for (i = 0; i < new_size; i++) { @@ -1089,6 +1117,9 @@ cl__make_hash_table(cl_object test, cl_object size, cl_object rehash_size, rehash_threshold = cl_max(2, min_threshold, rehash_threshold); h->hash.factor = ecl_to_double(rehash_threshold); h->hash.limit = h->hash.size * h->hash.factor; + if (h->hash.limit >= hsize) { + h->hash.limit = hsize - 1; + } h->hash.data = NULL; /* for GC sake */ h->hash.data = (struct ecl_hashtable_entry *) ecl_alloc(hsize * sizeof(struct ecl_hashtable_entry)); @@ -1281,27 +1312,54 @@ cl_hash_table_count(cl_object ht) @(return (ecl_make_fixnum(ecl_hash_table_count(ht)))); } + +/* HASH TABLE ITERATION + * + * We iterate from right to left across each group of consecutive + * non-empty buckets. This allows removing the current iteration + * element without iterating over elements twice or missing elements + * because we only change elements to the right of the current + * element when removing an element. For example, a hashtable of + * size 10 with 5 filled buckets is iterated over as follows: + * + * a..bc...de + * ^ + * a..bc...de + * ^ + * a..bc...de + * ^ + * a..bc...de + * ^ + * a..bc...de + * ^ + * + * If for example the element `e` is removed and the element `a` moves + * up at the place that `e` previously occupied, we don't iterate + * twice over `a`. + */ + static cl_object si_hash_table_iterate(cl_narg narg, ...) { const cl_env_ptr the_env = ecl_process_env(); cl_object env = the_env->function->cclosure.env; cl_object index = CAR(env); - cl_object ht = CADR(env); - cl_fixnum i; + cl_object endpoint = CADR(env); + cl_object ht = CADDR(env); + cl_fixnum i, j; if (!Null(index)) { - i = ecl_fixnum(index); - if (i < 0) - i = -1; - for (; ++i < ht->hash.size; ) { + i = ecl_fixnum(endpoint); + j = ecl_fixnum(index); + do { + j = (j == 0) ? ht->hash.size-1 : j - 1; struct ecl_hashtable_entry e = - copy_entry(ht->hash.data + i, ht); + copy_entry(ht->hash.data + j, ht); if (e.key != OBJNULL) { - cl_object ndx = ecl_make_fixnum(i); + cl_object ndx = ecl_make_fixnum(j); ECL_RPLACA(env, ndx); @(return ndx e.key e.value); } - } + } while (j != i); ECL_RPLACA(env, ECL_NIL); } @(return ECL_NIL); @@ -1310,9 +1368,13 @@ si_hash_table_iterate(cl_narg narg, ...) cl_object si_hash_table_iterator(cl_object ht) { + cl_fixnum i; assert_type_hash_table(@[si::hash-table-iterator], 1, ht); + /* Make sure we don't start in the middle of a group of consecutive + * filled buckets. */ + for (i = ht->hash.size-1; ht->hash.data[i].key != OBJNULL; i--); @(return ecl_make_cclosure_va(si_hash_table_iterate, - cl_list(2, ecl_make_fixnum(-1), ht), + cl_list(3, ecl_make_fixnum(i), ecl_make_fixnum(i), ht), @'si::hash-table-iterator', 0)); } @@ -1372,12 +1434,21 @@ cl_sxhash(cl_object key) cl_object cl_maphash(cl_object fun, cl_object ht) { - cl_index i; + cl_index i, j, hsize; assert_type_hash_table(@[maphash], 2, ht); - for (i = 0; i < ht->hash.size; i++) { - struct ecl_hashtable_entry e = ht->hash.data[i]; - if(e.key != OBJNULL) { + if (ht->hash.entries == 0) { + @(return ECL_NIL); + } + hsize = ht->hash.size; + /* Make sure we don't start in the middle of a group of consecutive + * filled buckets. */ + for (i = hsize-1; ht->hash.data[i].key != OBJNULL; i--); + j = i; + do { + j = (j == 0) ? hsize-1 : j - 1; + struct ecl_hashtable_entry e = ht->hash.data[j]; + if (e.key != OBJNULL) { cl_object key = e.key; cl_object val = e.value; switch (ht->hash.weak) { @@ -1394,7 +1465,7 @@ cl_maphash(cl_object fun, cl_object ht) } funcall(3, fun, key, val); } - } + } while (j != i); @(return ECL_NIL); }