1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
|
/* Copyright (C) 2012-2022 Free Software Foundation, Inc.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
License for more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License
and a copy of the GCC Runtime Library Exception along with this
program; see the files COPYING3 and COPYING.RUNTIME respectively.
If not, see <http://www.gnu.org/licenses/>. */
#ifndef _VTV_SET_H
#define _VTV_SET_H 1
/* Code in this file manages a collection of insert-only sets. We
have only tested the case where Key is uintptr_t, though it
theoretically should work for some other cases. All odd keys are
reserved, and must not be inserted into any of the sets. This code
is intended primarily for sets of pointers, and the code is
optimized for small sets (including size 0 and 1), but regardless
of the set size, insert() and contains() have close to O(1) speed
in practice.
TODO(gpike): fix this comment.
Recommended multithreaded use of a set:
For speed, we want to use a lock-free test for set membership. The
code handles simultaneous reads and inserts, as long as at most one
insertion is in progress at a time. After an insert, other threads
may not immediately "see" the inserted key if they perform a
lock-free read, so we recommend retrying, as explained below.
Also, to make data corruption less likely, we recommend using a
"normal" RW page as well as one or pages that are typically RO
but that can be switched to RW and back as needed. The latter
pages should contain sets. The former should contain a lock, L,
and an int or similar, num_writers. Then, to insert, something
like this would be safe:
o Acquire L.
o Increment num_writers; if that made it 1, change pages to RW.
o Release L.
o while (there are insertions to do in some set, S) {
acquire L;
do some insertions in S;
release L;
}
o Acquire L.
o Decrement num_writers; if that made it 0, change pages to RO.
o Release L.
And to check if the set contains some key, one could use
set.contains(key) ||
({ Acquire L; bool b = set.contains(key); Release L; b; })
In this scheme, the number of threads with reads in progress isn't
tracked, so old sets can never be deleted. In addition, on some
architectures the intentionally racy reads might cause contains()
to return true when it should have returned false. This should be
no problem on x86, and most other machines, where reading or
writing an aligned uintptr_t is atomic. E.g., on those machines,
if *p is 0 and one thread does *p = x while another reads *p, the
read will see either 0 or x.
To make the above easier, the insert_only_hash_sets class provides
an interface to manipulate any number of hash sets. One shouldn't
create objects of that class, as it has no member data and its
methods are static.
So the recommended model is to have a single lock, a single
num_writers variable, and some number of sets. If lock contention
becomes a problem then the sets can be divided into k groups, each
of which has a lock and a num_writers variable; or each set can be
represented as a set of values that equal 0 mod m, a set of values
that equal 1 mod m, ..., plus a set of values that equal m-1 mod m.
However, we expect most or all uses of this code to call contains()
much more frequently than anything else, so lock contention is
likely to be low. */
#include <algorithm>
#ifndef HASHTABLE_STATS
#define HASHTABLE_STATS 0
#endif
#ifndef HASHTABLE_STATS_ATOMIC
#define HASHTABLE_STATS_ATOMIC 0
#endif
#if HASHTABLE_STATS
#if HASHTABLE_STATS_ATOMIC
/* Stat counters, with atomics. */
#include <bits/atomic_word.h>
typedef _Atomic_word _AtomicStatCounter;
void
inc_by (_AtomicStatCounter &stat, int amount)
{
__atomic_add_fetch (&stat, amount, __ATOMIC_ACQ_REL);
}
#else
/* Stat counters, but without atomics. */
typedef int _AtomicStatCounter;
void
inc_by (_AtomicStatCounter& stat, int amount)
{
stat += amount;
}
#endif
/* Number of calls to contains(), insert(), etc. */
extern _AtomicStatCounter stat_insert;
extern _AtomicStatCounter stat_contains;
extern _AtomicStatCounter stat_resize;
extern _AtomicStatCounter stat_create;
/* Sum of set size over all calls to contains(). */
extern _AtomicStatCounter stat_contains_sizes;
/* contains() calls in a set whose capacity is more than 1. */
extern _AtomicStatCounter stat_contains_in_non_trivial_set;
/* Probes in a set whose capacity is more than 1. Ideally, this will
be pretty close to stat_contains_in_non_trivial_set. That will
happen if our hash function is good and/or important keys were
inserted before unimportant keys. */
extern _AtomicStatCounter stat_probes_in_non_trivial_set;
/* number of calls to contains() with size=0, 1, etc. */
extern _AtomicStatCounter stat_contains_size0;
extern _AtomicStatCounter stat_contains_size1;
extern _AtomicStatCounter stat_contains_size2;
extern _AtomicStatCounter stat_contains_size3;
extern _AtomicStatCounter stat_contains_size4;
extern _AtomicStatCounter stat_contains_size5;
extern _AtomicStatCounter stat_contains_size6;
extern _AtomicStatCounter stat_contains_size7;
extern _AtomicStatCounter stat_contains_size8;
extern _AtomicStatCounter stat_contains_size9;
extern _AtomicStatCounter stat_contains_size10;
extern _AtomicStatCounter stat_contains_size11;
extern _AtomicStatCounter stat_contains_size12;
extern _AtomicStatCounter stat_contains_size13_or_more;
extern _AtomicStatCounter stat_grow_from_size0_to_1;
extern _AtomicStatCounter stat_grow_from_size1_to_2;
extern _AtomicStatCounter stat_double_the_number_of_buckets;
extern _AtomicStatCounter stat_insert_key_that_was_already_present;
/* Hash collisions detected during insert_no_resize(). Only counts
hasher(k) == hasher(k'); hasher(k) % tablesize == hasher(k') %
tablesize is not sufficient. Will count collisions that are
detected during table resizes etc., so the same two keys may add to
this stat multiple times. */
extern _AtomicStatCounter stat_insert_found_hash_collision;
#include <string>
struct insert_only_hash_sets_logger
{
static char *
log (char c, char *buf)
{
*buf++ = c;
return buf;
}
static char *
log (const char *s, char *buf)
{ return strcpy (buf, s) + strlen (s); }
static char *
log (_AtomicStatCounter i, char *buf)
{
if (i < 10)
return log ((char) ('0' + i), buf);
else
return log ((char) ('0' + i % 10), log (i / 10, buf));
}
static char *
log (const char *label, _AtomicStatCounter i, char *buf)
{
buf = log (label, buf);
buf = log (": ", buf);
buf = log (i, buf);
return log ('\n', buf);
}
};
// Write stats to the given buffer, which should be at least 4000 bytes.
static inline void
insert_only_hash_tables_stats (char *buf)
{
buf = insert_only_hash_sets_logger::log ("insert", stat_insert, buf);
buf = insert_only_hash_sets_logger::log ("contains", stat_contains, buf);
buf = insert_only_hash_sets_logger::log ("resize", stat_resize, buf);
buf = insert_only_hash_sets_logger::log ("create", stat_create, buf);
buf = insert_only_hash_sets_logger::log ("insert_key_that_was_already_"
"present",
stat_insert_key_that_was_already_present,
buf);
buf = insert_only_hash_sets_logger::log ("contains_sizes",
stat_contains_sizes, buf);
buf = insert_only_hash_sets_logger::log ("contains_in_non_trivial_set",
stat_contains_in_non_trivial_set,
buf);
buf = insert_only_hash_sets_logger::log ("probes_in_non_trivial_set",
stat_probes_in_non_trivial_set,
buf);
buf = insert_only_hash_sets_logger::log ("contains_size0",
stat_contains_size0, buf);
buf = insert_only_hash_sets_logger::log ("contains_size1",
stat_contains_size1, buf);
buf = insert_only_hash_sets_logger::log ("contains_size2",
stat_contains_size2, buf);
buf = insert_only_hash_sets_logger::log ("contains_size3",
stat_contains_size3, buf);
buf = insert_only_hash_sets_logger::log ("contains_size4",
stat_contains_size4, buf);
buf = insert_only_hash_sets_logger::log ("contains_size5",
stat_contains_size5, buf);
buf = insert_only_hash_sets_logger::log ("contains_size6",
stat_contains_size6, buf);
buf = insert_only_hash_sets_logger::log ("contains_size7",
stat_contains_size7, buf);
buf = insert_only_hash_sets_logger::log ("contains_size8",
stat_contains_size8, buf);
buf = insert_only_hash_sets_logger::log ("contains_size9",
stat_contains_size9, buf);
buf = insert_only_hash_sets_logger::log ("contains_size10",
stat_contains_size10, buf);
buf = insert_only_hash_sets_logger::log ("contains_size11",
stat_contains_size11, buf);
buf = insert_only_hash_sets_logger::log ("contains_size12",
stat_contains_size12, buf);
buf = insert_only_hash_sets_logger::log ("contains_size13_or_more",
stat_contains_size13_or_more, buf);
buf = insert_only_hash_sets_logger::log ("grow_from_size0_to_1",
stat_grow_from_size0_to_1, buf);
buf = insert_only_hash_sets_logger::log ("grow_from_size1_to_2",
stat_grow_from_size1_to_2, buf);
buf = insert_only_hash_sets_logger::log ("insert_found_hash_collision",
stat_insert_found_hash_collision,
buf);
buf = insert_only_hash_sets_logger::log ("double_the_number_of_buckets",
stat_double_the_number_of_buckets,
buf);
*buf = '\0';
}
#else
/* No stats. */
#define inc_by(statname, amount) do { } while (false && (amount))
#endif
#define inc(statname) inc_by (statname, 1)
template <typename Key, class HashFcn, class Alloc>
class insert_only_hash_sets
{
public:
typedef Key key_type;
typedef size_t size_type;
typedef Alloc alloc_type;
enum { illegal_key = 1 };
enum { min_capacity = 4 };
#if HASHTABLE_STATS
enum { stats = true };
#else
enum { stats = false };
#endif
/* Do not directly use insert_only_hash_set. Instead, use the
static methods below to create and manipulate objects of the
following class.
Implementation details: each set is represented by a pointer
plus, perhaps, out-of-line data, which would be an object of type
insert_only_hash_set. For a pointer, s, the interpretation is: s
== NULL means empty set, lsb(s) == 1 means a set with one
element, which is (uintptr_t)s - 1, and otherwise s is a pointer
of type insert_only_hash_set*. So, to increase the size of a set
we have to change s and/or *s. To check if a set contains some
key we have to examine s and possibly *s. */
class insert_only_hash_set
{
public:
/* Insert a key. The key must not be a reserved key. */
static inline insert_only_hash_set *insert (key_type key,
insert_only_hash_set *s);
/* Create an empty set. */
static inline insert_only_hash_set *create (size_type capacity);
/* Return whether the given key is present. If key is illegal_key
then either true or false may be returned, but for all other
reserved keys false will be returned. */
static bool
contains (key_type key, const insert_only_hash_set *s)
{
if (stats)
{
inc (stat_contains);
switch (size (s))
{
case 0: inc (stat_contains_size0); break;
case 1: inc (stat_contains_size1); break;
case 2: inc (stat_contains_size2); break;
case 3: inc (stat_contains_size3); break;
case 4: inc (stat_contains_size4); break;
case 5: inc (stat_contains_size5); break;
case 6: inc (stat_contains_size6); break;
case 7: inc (stat_contains_size7); break;
case 8: inc (stat_contains_size8); break;
case 9: inc (stat_contains_size9); break;
case 10: inc (stat_contains_size10); break;
case 11: inc (stat_contains_size11); break;
case 12: inc (stat_contains_size12); break;
default: inc (stat_contains_size13_or_more); break;
}
inc_by (stat_contains_sizes, size (s));
}
return (singleton (s) ?
singleton_key (key) == s :
((s != NULL) && s->contains (key)));
}
/* Return a set's size. */
static size_type
size (const insert_only_hash_set *s)
{ return (s == NULL) ? 0 : (singleton (s) ? 1 : s->num_entries); }
static inline insert_only_hash_set *resize (size_type target_num_buckets,
insert_only_hash_set *s);
private:
/* Return whether a set has size 1. */
static bool
singleton (const insert_only_hash_set *s)
{ return (uintptr_t) s & 1; }
/* Return the representation of a singleton set containing the
given key. */
static insert_only_hash_set *
singleton_key (key_type key)
{ return (insert_only_hash_set *) ((uintptr_t) key + 1); }
/* Given a singleton set, what key does it contain? */
static key_type
extract_singleton_key (const insert_only_hash_set *s)
{
VTV_DEBUG_ASSERT (singleton (s));
return (key_type) ((uintptr_t) s - 1);
}
volatile key_type &
key_at_index (size_type index)
{ return buckets[index]; }
key_type
key_at_index (size_type index) const
{ return buckets[index]; }
size_type
next_index (size_type index, size_type indices_examined) const
{ return (index + indices_examined) & (num_buckets - 1); }
inline void insert_no_resize (key_type key);
inline bool contains (key_type key) const;
inline insert_only_hash_set *resize_if_necessary (void);
size_type num_buckets; /* Must be a power of 2 not less than
min_capacity. */
volatile size_type num_entries;
volatile key_type buckets[0]; /* Actual array size is num_buckets. */
};
/* Create an empty set with the given capacity. Requires that n be
0 or a power of 2. If 1 < n < min_capacity then treat n as
min_capacity. Sets *handle. Returns true unless the allocator
fails. Subsequent operations on this set should use the same
handle. */
static inline bool create (size_type n, insert_only_hash_set **handle);
/* Force the capacity of a set to be n, unless it was more than n
already. Requires that n be 0 or a power of 2. Sets *handle
unless the current capacity is n or more. Returns true unless
the allocator fails. */
static inline bool resize (size_type n, insert_only_hash_set **handle);
/* Insert a key. *handle is unmodified unless (1) a resize occurs,
or (2) the set was initially empty. Returns true unless the
allocator fails during a resize. If the allocator fails during a
resize then the set is reset to be the empty set. The key must
not be a reserved key. */
static inline bool insert (key_type key, insert_only_hash_set **handle);
/* Check for the presence of a key. If key is illegal_key then
either true or false may be returned, but for all other reserved
keys false will be returned. */
static inline bool
contains (key_type key, /* const */ insert_only_hash_set **handle)
{ return insert_only_hash_set::contains (key, *handle); }
/* Return the size of the given set. */
static size_type
size (const insert_only_hash_set **handle)
{ return insert_only_hash_set::size (*handle); }
static bool
is_reserved_key (key_type key)
{ return ((uintptr_t) key % 2) == 1; }
};
template <typename Key, class HashFcn, class Alloc>
typename insert_only_hash_sets <Key, HashFcn, Alloc>::insert_only_hash_set *
insert_only_hash_sets <Key, HashFcn, Alloc>::insert_only_hash_set::resize
(size_type n, insert_only_hash_set *s)
{
if (s == NULL)
return create (n);
size_type capacity = singleton (s) ? 1 : s->num_buckets;
if (n <= capacity)
return s;
insert_only_hash_set *result =
create (std::max<size_type> (n, min_capacity));
if (result != NULL)
{
if (singleton (s))
{
result->insert_no_resize (extract_singleton_key (s));
}
else
{
for (size_type i = 0; i < s->num_buckets; i++)
if (s->buckets[i] != (key_type) illegal_key)
result->insert_no_resize (s->buckets[i]);
}
VTV_DEBUG_ASSERT (size (result) == size (s));
}
return result;
}
template <typename Key, class HashFcn, class Alloc>
typename insert_only_hash_sets <Key, HashFcn, Alloc>::insert_only_hash_set *
insert_only_hash_sets <Key, HashFcn, Alloc>::insert_only_hash_set::insert
(key_type key, insert_only_hash_set *s)
{
VTV_DEBUG_ASSERT (!is_reserved_key (key));
inc_by (stat_grow_from_size0_to_1, s == NULL);
if (s == NULL)
return singleton_key (key);
if (singleton (s))
{
const key_type old_key = extract_singleton_key (s);
if (old_key == key)
return s;
/* Grow from size 1 to size 2. */
inc (stat_grow_from_size1_to_2);
s = create (2);
if (s == NULL)
return NULL;
s->insert_no_resize (old_key);
s->insert_no_resize (key);
VTV_DEBUG_ASSERT (size (s) == 2);
return s;
}
s = s->resize_if_necessary();
if (s != NULL)
s->insert_no_resize (key);
return s;
}
template <typename Key, class HashFcn, class Alloc>
typename insert_only_hash_sets <Key, HashFcn, Alloc>::insert_only_hash_set *
insert_only_hash_sets <Key, HashFcn, Alloc>::insert_only_hash_set::create
(size_type capacity)
{
if (capacity <= 1)
return NULL;
VTV_DEBUG_ASSERT (capacity > 1 && (capacity & (capacity - 1)) == 0);
VTV_DEBUG_ASSERT (sizeof (insert_only_hash_set) == 2 * sizeof (size_type));
capacity = std::max <size_type> (capacity, min_capacity);
const size_t num_bytes = sizeof (insert_only_hash_set) +
sizeof (key_type) * capacity;
alloc_type alloc;
insert_only_hash_set *result = (insert_only_hash_set *) alloc (num_bytes);
result->num_buckets = capacity;
result->num_entries = 0;
for (size_type i = 0; i < capacity; i++)
result->buckets[i] = (key_type) illegal_key;
return result;
}
template <typename Key, class HashFcn, class Alloc>
void
insert_only_hash_sets<Key, HashFcn,
Alloc>::insert_only_hash_set::insert_no_resize
(key_type key)
{
HashFcn hasher;
const size_type capacity = num_buckets;
VTV_DEBUG_ASSERT (capacity >= min_capacity);
VTV_DEBUG_ASSERT (!is_reserved_key (key));
size_type index = hasher (key) & (capacity - 1);
key_type k = key_at_index (index);
size_type indices_examined = 0;
while (k != key)
{
++indices_examined;
if (k == (key_type) illegal_key)
{
key_at_index (index) = key;
++num_entries;
return;
}
else
{
inc_by (stat_insert_found_hash_collision,
hasher (k) == hasher (key));
}
VTV_DEBUG_ASSERT (indices_examined < capacity);
index = next_index (index, indices_examined);
k = key_at_index (index);
}
}
template<typename Key, class HashFcn, class Alloc>
bool
insert_only_hash_sets<Key, HashFcn, Alloc>::insert_only_hash_set::contains
(key_type key) const
{
inc (stat_contains_in_non_trivial_set);
HashFcn hasher;
const size_type capacity = num_buckets;
size_type index = hasher (key) & (capacity - 1);
key_type k = key_at_index (index);
size_type indices_examined = 0;
inc (stat_probes_in_non_trivial_set);
while (k != key)
{
++indices_examined;
if (/*UNLIKELY*/(k == (key_type) illegal_key
|| indices_examined == capacity))
return false;
index = next_index (index, indices_examined);
k = key_at_index (index);
inc (stat_probes_in_non_trivial_set);
}
return true;
}
template <typename Key, class HashFcn, class Alloc>
typename insert_only_hash_sets <Key, HashFcn, Alloc>::insert_only_hash_set *
insert_only_hash_sets<Key, HashFcn,
Alloc>::insert_only_hash_set::resize_if_necessary (void)
{
VTV_DEBUG_ASSERT (num_buckets >= min_capacity);
size_type unused = num_buckets - num_entries;
if (unused < (num_buckets >> 2))
{
inc (stat_double_the_number_of_buckets);
size_type new_num_buckets = num_buckets * 2;
insert_only_hash_set *s = create (new_num_buckets);
for (size_type i = 0; i < num_buckets; i++)
if (buckets[i] != (key_type) illegal_key)
s->insert_no_resize (buckets[i]);
VTV_DEBUG_ASSERT (size (this) == size (s));
return s;
}
else
return this;
}
template<typename Key, class HashFcn, class Alloc>
bool
insert_only_hash_sets<Key, HashFcn, Alloc>::create (size_type n,
insert_only_hash_set **handle)
{
inc (stat_create);
*handle = insert_only_hash_set::create (n);
return (n <= 1) || (*handle != NULL);
}
template<typename Key, class HashFcn, class Alloc>
bool
insert_only_hash_sets<Key, HashFcn, Alloc>::resize (size_type n,
insert_only_hash_set **handle)
{
inc (stat_resize);
*handle = insert_only_hash_set::resize (n, *handle);
return (n <= 1) || (*handle != NULL);
}
template<typename Key, class HashFcn, class Alloc>
bool
insert_only_hash_sets<Key, HashFcn, Alloc>::insert (key_type key,
insert_only_hash_set **handle)
{
inc (stat_insert);
const size_type old_size = insert_only_hash_set::size (*handle);
*handle = insert_only_hash_set::insert (key, *handle);
if (*handle != NULL)
{
const size_type delta = insert_only_hash_set::size (*handle) - old_size;
inc_by (stat_insert_key_that_was_already_present, delta == 0);
}
return *handle != NULL;
}
#endif /* VTV_SET_H */
|