From 5c138480d03ba6f2dedfd3c5ce68fb8a05406f6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Kochma=C5=84ski?=
Date: Thu, 31 Oct 2024 15:31:03 +0100
Subject: [PATCH] hash: fix numerous issues with weak hash tables

1. We don't normalize garbage collected entries. Doing that created a false
   free bucket, so GETHASH and SETHASH stopped searching for the entry too
   early. Fixes #761.

2. MAPHASH does not map over garbage collected entries. Previously MAPHASH
   did not call copy_entry, so it was always accessing key/val even if they
   were recently garbage collected, effectively mapping over NIL in place of
   the collected value.

3. HASH-TABLE-CONTENT had a similar issue to MAPHASH.

4. REMHASH did fill the hole, but it didn't replace the moved entry with a
   free bucket; moreover we used the value instead of the key to compute the
   hash for the distance function.

Additionally we introduce an optimization: SETHASH first tries to fill a
garbage collected entry in the bucket before it tries to extend the table.
---
 src/c/hash.d | 122 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 72 insertions(+), 50 deletions(-)
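Note for reviewers: the three bucket states listed in the new comment induce
a simple probing discipline. Below is a minimal standalone sketch of that
discipline - it is not ECL code; the fixed table size, the long keys, and the
KEY_FREE/KEY_DEAD markers standing in for the OBJNULL/ECL_NIL states are all
made up for illustration, and we assume the table never fills up completely
(the real code guarantees that through hash.limit).

    #include <stdio.h>
    #include <stddef.h>

    #define SIZE 8
    #define KEY_FREE  0L    /* [key=OBJNULL, value=OBJNULL]: free bucket */
    #define KEY_DEAD (-1L)  /* [key=ECL_NIL, value=OBJNULL]: tombstone   */

    struct entry { long key; long value; };

    static size_t hash(long key) { return (size_t)key % SIZE; }

    /* GETHASH-style lookup: only a free bucket ends the probe; a tombstone
       must not, which is exactly the false-free-bucket bug from #761. */
    static long *lookup(struct entry *tab, long key)
    {
      for (size_t i = hash(key); ; i = (i + 1) % SIZE) {
        struct entry *p = tab + i;
        if (p->key == KEY_FREE) return NULL;
        if (p->key == KEY_DEAD) continue;
        if (p->key == key) return &p->value;
      }
    }

    /* SETHASH-style insert: remember a tombstone seen along the way and
       prefer filling it over claiming a fresh bucket at the chain's end. */
    static void insert(struct entry *tab, long key, long value)
    {
      struct entry *f = NULL;
      for (size_t i = hash(key); ; i = (i + 1) % SIZE) {
        struct entry *p = tab + i;
        if (p->key == KEY_FREE) { if (f == NULL) f = p; break; }
        if (p->key == KEY_DEAD) { f = p; continue; }
        if (p->key == key)      { f = p; break; }
      }
      f->key = key;
      f->value = value;
    }

    int main(void)
    {
      struct entry tab[SIZE] = {{0, 0}};
      insert(tab, 3, 30);            /* lands in its home slot 3          */
      insert(tab, 11, 110);          /* collides with 3, probes to slot 4 */
      tab[hash(3)].key = KEY_DEAD;   /* simulate the GC collecting key 3  */
      long *v = lookup(tab, 11);     /* must skip the tombstone in slot 3 */
      printf("%ld\n", v ? *v : -1);  /* prints 110 */
      return 0;
    }

The invariants mirror the patch: lookups skip tombstones instead of stopping
at them, and inserts reuse a tombstone before extending the table.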
diff --git a/src/c/hash.d b/src/c/hash.d
index 08b2a3654..5a1968b40 100644
--- a/src/c/hash.d
+++ b/src/c/hash.d
@@ -553,7 +553,33 @@ _ecl_remhash_generic(cl_object key, cl_object hashtable)
 
 /*
  * WEAK HASH TABLES
+ *
+ * Entries in a weak hash table may disappear without an explicit REMHASH.
+ * Our implementation handles collisions with open addressing, that is, we
+ * put the colliding element at the first free spot starting from its hash.
+ *
+ * Until recently we implemented REMHASH by inserting "tombstones", that is,
+ * by simply invalidating the entry. Currently, after removing an entry, we
+ * fill the hole by shifting the elements after it to the left; that yields
+ * some benefits in scenarios where we frequently add and remove elements.
+ *
+ * Since weak entries may disappear at any time, we need to either fill holes
+ * in GETHASH/SETHASH too, or revert to inserting "tombstones". Notice that
+ * some functions are common to hash tables with and without weak entries -
+ * for example MAPHASH. These functions assume that entry indexes do not
+ * change while iterating, so we can't shift values in copy_entry unless we
+ * customize these functions too.
+ *
+ * For the reasons above, weak entries are not normalized to OBJNULL; rather
+ * we leave the weak entry in the same place as a tombstone. SETHASH reuses
+ * these entries, while REMHASH behaves the same for all hash tables.
+ *
+ * [key=OBJNULL, value=OBJNULL] - free bucket
+ * [key=ECL_NIL, value=OBJNULL] - tombstone
+ * [key=OBJNULL, value=ECL_NIL] - tombstone copy
+ *
  */
+
 #ifndef ECL_WEAK_HASH
 #define copy_entry(e,h) *(e)
 #endif
@@ -647,6 +673,11 @@ copy_entry(struct ecl_hashtable_entry *e, cl_object h)
 {
   if (e->key == OBJNULL) {
     return *e;
+  } else if (e->value == OBJNULL) {
+    struct ecl_hashtable_entry output = *e;
+    output.key = OBJNULL;
+    output.value = ECL_NIL;
+    return output;
   } else {
     struct ecl_hashtable_entry output = *e;
     switch (h->hash.weak) {
@@ -678,80 +709,84 @@ copy_entry(struct ecl_hashtable_entry *e, cl_object h)
     default:
       return output;
     }
-    h->hash.entries--;
     output.key = OBJNULL;
     output.value = ECL_NIL;
-    return *e = output;
-  }
-}
-
-/* Returns the original entry p, and makes a normalized copy to aux. */
-static struct ecl_hashtable_entry *
-_ecl_weak_hash_loop(cl_hashkey h, cl_object key, cl_object hashtable,
-                    struct ecl_hashtable_entry *aux)
-{
-  cl_index i, hsize = hashtable->hash.size;
-  for (i = h % hsize; ; i = (i + 1) % hsize) {
-    struct ecl_hashtable_entry *p = hashtable->hash.data + i;
-    struct ecl_hashtable_entry e = *aux = copy_entry(p, hashtable);
-    if (e.key == OBJNULL || _ecl_hash_test(hashtable, key, e.key)) {
-      return p;
-    }
+    e->key = ECL_NIL;
+    e->value = OBJNULL;
+    return output;
   }
 }
 
 static cl_object
 _ecl_gethash_weak(cl_object key, cl_object hashtable, cl_object def)
 {
+  cl_index i, hsize = hashtable->hash.size;
   cl_hashkey h = _ecl_hash_key(hashtable, key);
-  struct ecl_hashtable_entry aux[1];
-  _ecl_weak_hash_loop(h, key, hashtable, aux);
-  if (aux->key == OBJNULL) {
-    return def;
+  struct ecl_hashtable_entry *p, e;
+  for (i = h % hsize; ; i = (i + 1) % hsize) {
+    p = hashtable->hash.data + i;
+    e = copy_entry(p, hashtable);
+    if (p->key == OBJNULL) return def;
+    if (p->value == OBJNULL) continue; /* skip the tombstone */
+    if (_ecl_hash_test(hashtable, key, e.key)) return e.value;
   }
-  return aux->value;
 }
 
 static cl_object
 _ecl_sethash_weak(cl_object key, cl_object hashtable, cl_object value)
 {
+  cl_index i, hsize = hashtable->hash.size;
   cl_hashkey h = _ecl_hash_key(hashtable, key);
-  struct ecl_hashtable_entry aux[1];
-  struct ecl_hashtable_entry *e;
+  struct ecl_hashtable_entry e, *p, *f = NULL;
 AGAIN:
-  e = _ecl_weak_hash_loop(h, key, hashtable, aux);
-  if (aux->key == OBJNULL) {
+  for (i = h % hsize; ; i = (i + 1) % hsize) {
+    p = hashtable->hash.data + i;
+    e = copy_entry(p, hashtable);
+    if (p->key == OBJNULL) { break; }
+    if (p->value == OBJNULL) { f = p; continue; }
+    if (_ecl_hash_test(hashtable, key, e.key)) {
+      f = p;
+      break;
+    }
+  }
+  if (p->key == OBJNULL && f == NULL) {
    cl_index i = hashtable->hash.entries + 1;
    if (i >= hashtable->hash.limit) {
      hashtable = ecl_extend_hashtable(hashtable);
      goto AGAIN;
    }
    hashtable->hash.entries = i;
-    e->key = _ecl_store_key(hashtable, key);
+    f = p;
  }
-  e->value = _ecl_store_val(hashtable, value);
+  f->key = _ecl_store_key(hashtable, key);
+  f->value = _ecl_store_val(hashtable, value);
  return hashtable;
 }
-
 static bool
 _ecl_remhash_weak(cl_object key, cl_object hashtable)
 {
   cl_index i, hsize = hashtable->hash.size;
   cl_hashkey h = _ecl_hash_key(hashtable, key);
+  struct ecl_hashtable_entry *p, e;
   for (i = h % hsize; ; i = (i + 1) % hsize) {
-    struct ecl_hashtable_entry *p = hashtable->hash.data + i;
-    struct ecl_hashtable_entry e = copy_entry(p, hashtable);
-    if (e.key == OBJNULL) return 0;
+    p = hashtable->hash.data + i;
+    e = copy_entry(p, hashtable);
+    if (p->key == OBJNULL) return 0;
+    /* We could try to shift consecutive tombstones here(!) */
+    if (e.key == OBJNULL) continue;
    if (_ecl_hash_test(hashtable, key, e.key)) {
      cl_index j = (i+1) % hsize, k;
+      struct ecl_hashtable_entry *q, f;
      for (k = 1; k <= hsize; j = (j+1) % hsize, k++) {
-        struct ecl_hashtable_entry *q = hashtable->hash.data + j;
-        struct ecl_hashtable_entry f = copy_entry(q, hashtable);
+        q = hashtable->hash.data + j;
+        f = copy_entry(q, hashtable);
        if (f.key == OBJNULL) {
+          p->key = OBJNULL;
+          p->value = OBJNULL;
          break;
        }
-        cl_hashkey hf = _ecl_hash_key(hashtable, f.value);
+        cl_hashkey hf = _ecl_hash_key(hashtable, f.key);
        cl_index m = hf % hsize;
        cl_index d = (j >= m) ? (j - m) : (j + hsize - m);
        if (k <= d) {
@@ -1418,23 +1453,10 @@ cl_maphash(cl_object fun, cl_object ht)
   j = i;
   do {
     j = (j == 0) ? hsize-1 : j - 1;
-    /* FIXME copy the entry!
-       We don't want map over collected entries. */
-    struct ecl_hashtable_entry e = ht->hash.data[j];
+    struct ecl_hashtable_entry e = copy_entry(ht->hash.data + j, ht);
     if (e.key != OBJNULL) {
       cl_object key = e.key;
       cl_object val = e.value;
-      switch (ht->hash.weak) {
-      case ecl_htt_weak_key:
-        key = si_weak_pointer_value(key);
-        break;
-      case ecl_htt_weak_value:
-        val = si_weak_pointer_value(val);
-        break;
-      case ecl_htt_weak_key_and_value:
-      case ecl_htt_weak_key_or_value:
-        key = si_weak_pointer_value(key);
-        val = si_weak_pointer_value(val);
-      }
       funcall(3, fun, key, val);
     }
   } while (j != i);
@@ -1448,7 +1470,7 @@ si_hash_table_content(cl_object ht)
   cl_object output = ECL_NIL;
   assert_type_hash_table(@[ext::hash-table-content], 2, ht);
   for (i = 0; i < ht->hash.size; i++) {
-    struct ecl_hashtable_entry e = ht->hash.data[i];
+    struct ecl_hashtable_entry e = copy_entry(ht->hash.data + i, ht);
     if (e.key != OBJNULL)
       output = ecl_cons(ecl_cons(e.key, e.value), output);
   }
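Reviewer note on item 4 of the commit message: the backward-shift removal can
be illustrated with a similar standalone sketch (again not ECL code; the toy
definitions are repeated so it compiles on its own, and the table is assumed
to never fill up completely). It shows the two fixes: the displacement of a
shift candidate is computed from its key rather than its value, and the final
hole is turned into a real free bucket.

    #include <stdio.h>
    #include <stddef.h>

    #define SIZE 8
    #define KEY_FREE  0L
    #define KEY_DEAD (-1L)

    struct entry { long key; long value; };

    static size_t hash(long key) { return (size_t)key % SIZE; }

    /* REMHASH-style removal.  An entry at slot j with home slot m may move
       back into the hole at slot i only when the hole lies on its probe
       path, that is when the hole->j distance does not exceed the m->j
       displacement.  The displacement is derived from the KEY; deriving it
       from the value was the bug fixed by this patch. */
    static void remove_key(struct entry *tab, long key)
    {
      size_t i = hash(key);
      for (;; i = (i + 1) % SIZE) {
        if (tab[i].key == KEY_FREE) return;   /* not present        */
        if (tab[i].key == key) break;         /* i becomes the hole */
      }
      for (size_t j = (i + 1) % SIZE; tab[j].key != KEY_FREE;
           j = (j + 1) % SIZE) {
        if (tab[j].key == KEY_DEAD) continue;        /* leave tombstones    */
        size_t m = hash(tab[j].key);                 /* home slot, from key */
        size_t d = (j >= m) ? j - m : j + SIZE - m;  /* displacement m->j   */
        size_t k = (j >= i) ? j - i : j + SIZE - i;  /* distance hole->j    */
        if (k <= d) {    /* safe: the hole is on j's probe path */
          tab[i] = tab[j];
          i = j;         /* the hole advances to j */
        }
      }
      tab[i].key = KEY_FREE;   /* the last hole becomes a free bucket, */
      tab[i].value = 0;        /* not a half-cleared entry             */
    }

    int main(void)
    {
      struct entry tab[SIZE] = {{0, 0}};
      tab[3] = (struct entry){3, 30};    /* in its home slot 3          */
      tab[4] = (struct entry){11, 110};  /* home slot 3, displaced to 4 */
      remove_key(tab, 3);
      printf("%ld\n", tab[3].key);       /* prints 11: shifted back     */
      return 0;
    }

The sketch leaves the weak-pointer snapshotting (copy_entry) out entirely;
in the real code every inspected entry must be copied first, because the GC
may clear it between the test and the use.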