1/* Copyright (c) 2017 - 2022 LiteSpeed Technologies Inc.  See LICENSE. */
2/*
3 * lsquic_malo.c -- malo allocator implementation.
4 *
5 * The malo allocator is a pool of objects of fixed size.  It tries to
6 * allocate and deallocate objects as fast as possible.  To do so, it
7 * does the following:
8 *
9 *  1. Allocations occur 4 KB at a time.
10 *  2. No division or multiplication operations are performed for
11 *     appropriately sized objects.  (More on this below.)
12 *
13 * (In recent testing, malo was about 2.7 times faster than malloc for
14 * 64-byte objects.)
15 *
16 * Besides speed, two other important characteristics distinguish it
17 * from other pool allocators:
18 *
19 *  1. To free (put) an object, one does not need a pointer to the malo
20 *     object.  This makes this allocator easy to use.
21 *  2. A built-in iterator is provided to iterate over all allocated
22 *     objects (with ability safely to release objects while iterator
23 *     is active).  This may be useful in some circumstances.
24 *
25 * To gain all these advantages, there are trade-offs:
26 *
27 *  1. There are two memory penalties:
28 *      a. Per object overhead.  If an object is at least ROUNDUP_THRESH in
29 *         size as the next power of two, the allocator uses that power of
30 *         two value as the object size.  This is done to avoid using
31 *         division and multiplication.  For example, a 104-byte object
32 *         will have a 24-byte overhead.
33 *      b. Per page overhead.  Page links occupy some bytes in the
34 *         page.  To keep things fast, at least one slot per page is
35 *         always occupied, independent of object size.  Thus, for a
36 *         1 KB object size, 25% of the page is used for the page
37 *         header.
38 *  2. 4 KB pages are not freed until the malo allocator is destroyed.
39 *     This is something to keep in mind.
40 *
41 * P.S. In Russian, "malo" (��������) means "little" or "few".  Thus, the
42 *      malo allocator aims to perform its job in as few CPU cycles as
43 *      possible.
44 */
45
46#include <assert.h>
47#include <errno.h>
48#include <stdint.h>
49#include <stdlib.h>
50#include <sys/queue.h>
51#ifdef WIN32
52#include <vc_compat.h>
53#include <intrin.h>
54#endif
55
56#include "fiu-local.h"
57#include "lsquic_malo.h"
58
59#ifndef LSQUIC_USE_POOLS
60#define LSQUIC_USE_POOLS 1
61#endif
62
63/* 64 slots in a 4KB page means that the smallest object is 64 bytes.
64 * The largest object is 2KB.
65 */
66#define MALO_MIN_NBITS 6
67#define MALO_MAX_NBITS 11
68
69#define ROUNDUP_THRESH 0.75f
70
71/* A "free page" is a page with free slots available.
72 */
73
74#if LSQUIC_USE_POOLS
75static unsigned find_free_slot (uint64_t slots);
76static unsigned size_in_bits (size_t sz);
77#endif
78
79struct malo_page {
80    SLIST_ENTRY(malo_page)  next_page;
81    LIST_ENTRY(malo_page)   next_free_page;
82    struct malo            *malo;
83    uint64_t                slots,
84                            full_slot_mask;
85    unsigned                nbits;  /* If pow is zero, stores object size */
86    unsigned                initial_slot;
87    int                     pow;    /* True if object is power of 2 */
88};
89
90typedef char malo_header_fits_in_one_slot
91    [(sizeof(struct malo_page) > (1 << MALO_MIN_NBITS)) ? -1 : 1];
92
93#if !LSQUIC_USE_POOLS
94struct nopool_elem
95{
96    TAILQ_ENTRY(nopool_elem)    next;
97    struct malo                *malo;
98    unsigned char               data[0];
99};
100#endif
101
102struct malo {
103#if LSQUIC_USE_POOLS
104    struct malo_page        page_header;
105    SLIST_HEAD(, malo_page) all_pages;
106    LIST_HEAD(, malo_page)  free_pages;
107    struct {
108        struct malo_page   *cur_page;
109        unsigned            next_slot;
110    }                       iter;
111#else
112    /* List of all elements: used by the iterator */
113    TAILQ_HEAD(, nopool_elem)   elems;
114    size_t                      obj_size;
115    struct nopool_elem         *next_iter_elem;
116#endif
117};
118
119struct malo *
120lsquic_malo_create (size_t obj_size)
121{
122#if LSQUIC_USE_POOLS
123    int pow, n_slots;
124    unsigned nbits;
125
126    obj_size = (obj_size + 7) & -8;
127    nbits = size_in_bits(obj_size);
128    if (nbits < MALO_MIN_NBITS)
129        nbits = MALO_MIN_NBITS;
130    else if (nbits > MALO_MAX_NBITS)
131    {
132        errno = EOVERFLOW;
133        return NULL;
134    }
135
136    pow = obj_size <= (1 << MALO_MIN_NBITS)
137          || (float) obj_size / (1 << nbits) > ROUNDUP_THRESH;
138
139    struct malo *malo;
140    if (0 != posix_memalign((void **) &malo, 0x1000, 0x1000))
141        return NULL;
142
143    SLIST_INIT(&malo->all_pages);
144    LIST_INIT(&malo->free_pages);
145    malo->iter.cur_page = &malo->page_header;
146    malo->iter.next_slot = 0;
147
148    if (pow)
149        n_slots =   sizeof(*malo) / (1 << nbits)
150                + ((sizeof(*malo) % (1 << nbits)) > 0);
151    else
152        n_slots =   sizeof(*malo) / obj_size
153                + ((sizeof(*malo) % obj_size) > 0);
154
155    struct malo_page *const page = &malo->page_header;
156    SLIST_INSERT_HEAD(&malo->all_pages, page, next_page);
157    LIST_INSERT_HEAD(&malo->free_pages, page, next_free_page);
158    page->malo = malo;
159    if (!pow)
160        page->full_slot_mask = (1ULL << (0x1000 / obj_size)) - 1;
161    else if (nbits == MALO_MIN_NBITS)
162        page->full_slot_mask = ~0ULL;
163    else
164        page->full_slot_mask = (1ULL << (1 << (12 - nbits))) - 1;
165    page->slots = (1ULL << n_slots) - 1;
166    page->pow = pow;
167    page->nbits = pow ? nbits : obj_size;
168    page->initial_slot = n_slots;
169
170    return malo;
171#else
172    struct malo *malo;
173
174    /* Use the same sizing mechanism as in the normal version */
175    if (obj_size < (1 << MALO_MIN_NBITS))
176        obj_size = 1 << MALO_MIN_NBITS;
177    else
178        obj_size = (obj_size + 7) & -8;
179
180    malo = malloc(sizeof(*malo));
181    if (malo)
182    {
183        TAILQ_INIT(&malo->elems);
184        malo->obj_size = obj_size;
185        malo->next_iter_elem = NULL;
186        return malo;
187    }
188    else
189        return NULL;
190#endif
191}
192
193
194#if LSQUIC_USE_POOLS
195static struct malo_page *
196allocate_page (struct malo *malo)
197{
198    struct malo_page *page;
199    if (0 != posix_memalign((void **) &page, 0x1000, 0x1000))
200        return NULL;
201    SLIST_INSERT_HEAD(&malo->all_pages, page, next_page);
202    LIST_INSERT_HEAD(&malo->free_pages, page, next_free_page);
203    page->slots = 1;
204    page->full_slot_mask = malo->page_header.full_slot_mask;
205    page->nbits = malo->page_header.nbits;
206    page->pow = malo->page_header.pow;
207    page->malo = malo;
208    page->initial_slot = 1;
209    return page;
210}
211#endif
212
213
214#define FAIL_NOMEM do { errno = ENOMEM; return NULL; } while (0)
215
216/* Get a new object. */
217void *
218lsquic_malo_get (struct malo *malo)
219{
220#if LSQUIC_USE_POOLS
221    fiu_do_on("malo/get", FAIL_NOMEM);
222    struct malo_page *page = LIST_FIRST(&malo->free_pages);
223    if (!page)
224    {
225        page = allocate_page(malo);
226        if (!page)
227            return NULL;
228    }
229    unsigned slot = find_free_slot(page->slots);
230    page->slots |= (1ULL << slot);
231    if (page->full_slot_mask == page->slots)
232        LIST_REMOVE(page, next_free_page);
233    if (page->pow)
234        return (char *) page + (slot << page->nbits);
235    else
236        return (char *) page + (slot * page->nbits);
237#else
238    struct nopool_elem *el;
239    el = malloc(sizeof(*el) + malo->obj_size);
240    if (el)
241    {
242        TAILQ_INSERT_HEAD(&malo->elems, el, next);
243        el->malo = malo;
244        return el->data;
245    }
246    else
247        return NULL;
248#endif
249}
250
251
252/* Return obj to the pool */
253void
254lsquic_malo_put (void *obj)
255{
256#if LSQUIC_USE_POOLS
257    uintptr_t page_addr = (uintptr_t) obj & ~((1 << 12) - 1);
258    struct malo_page *page = (void *) page_addr;
259    unsigned slot;
260    if (page->pow)
261        slot = ((uintptr_t) obj - page_addr) >> page->nbits;
262    else
263        slot = ((uintptr_t) obj - page_addr) / page->nbits;
264    if (page->full_slot_mask == page->slots)
265        LIST_INSERT_HEAD(&page->malo->free_pages, page, next_free_page);
266    page->slots &= ~(1ULL << slot);
267#else
268    struct nopool_elem *el;
269    el = (struct nopool_elem *) ((char *) obj - sizeof(*el));
270    if (el == el->malo->next_iter_elem)
271        el->malo->next_iter_elem = TAILQ_NEXT(el->malo->next_iter_elem, next);
272    TAILQ_REMOVE(&el->malo->elems, el, next);
273    free(el);
274#endif
275}
276
277
278void
279lsquic_malo_destroy (struct malo *malo)
280{
281#if LSQUIC_USE_POOLS
282    struct malo_page *page, *next;
283    page = SLIST_FIRST(&malo->all_pages);
284    while (page != &malo->page_header)
285    {
286        next = SLIST_NEXT(page, next_page);
287#ifndef WIN32
288        free(page);
289#else
290        _aligned_free(page);
291#endif
292        page = next;
293    }
294#ifndef WIN32
295    free(page);
296#else
297        _aligned_free(page);
298#endif
299#else
300    struct nopool_elem *el, *next_el;
301    for (el = TAILQ_FIRST(&malo->elems); el; el = next_el)
302    {
303        next_el = TAILQ_NEXT(el, next);
304        free(el);
305    }
306    free(malo);
307#endif
308}
309
310
311/* The iterator is built-in.  Usage:
312 * void *obj;
313 * for (obj = lsquic_malo_first(malo); obj; lsquic_malo_next(malo))
314 *     do_stuff(obj);
315 */
316void *
317lsquic_malo_first (struct malo *malo)
318{
319#if LSQUIC_USE_POOLS
320    malo->iter.cur_page = SLIST_FIRST(&malo->all_pages);
321    malo->iter.next_slot = malo->iter.cur_page->initial_slot;
322#else
323    malo->next_iter_elem = TAILQ_FIRST(&malo->elems);
324#endif
325    return lsquic_malo_next(malo);
326}
327
328
329void *
330lsquic_malo_next (struct malo *malo)
331{
332#if LSQUIC_USE_POOLS
333    struct malo_page *page;
334    unsigned max_slot, slot;
335
336    page = malo->iter.cur_page;
337    if (page)
338    {
339        if (page->pow)
340            max_slot = 1 << (12 - page->nbits);     /* Same for all pages */
341        else
342            max_slot = 0x1000 / page->nbits;
343        slot = malo->iter.next_slot;
344        while (1)
345        {
346            for (; slot < max_slot; ++slot)
347            {
348                if (page->slots & (1ULL << slot))
349                {
350                    malo->iter.cur_page  = page;
351                    malo->iter.next_slot = slot + 1;
352                    if (page->pow)
353                        return (char *) page + (slot << page->nbits);
354                    else
355                    {
356                        assert(slot * (page->nbits + 1) < 0x1000);
357                        return (char *) page + (slot * page->nbits);
358                    }
359                }
360            }
361            page = SLIST_NEXT(page, next_page);
362            if (page)
363                slot = page->initial_slot;
364            else
365            {
366                malo->iter.cur_page = NULL;     /* Stop iterator */
367                return NULL;
368            }
369        }
370    }
371
372    return NULL;
373#else
374    struct nopool_elem *retval;
375
376    if (malo->next_iter_elem)
377    {
378        retval = malo->next_iter_elem;
379        malo->next_iter_elem = TAILQ_NEXT(malo->next_iter_elem, next);
380        return retval->data;
381    }
382    else
383        return NULL;
384#endif
385}
386
387
388#if LSQUIC_USE_POOLS
389static unsigned
390size_in_bits (size_t sz)
391{
392#if __GNUC__
393    unsigned clz = sz > 1 ? __builtin_clz(sz - 1) : 31;
394    return 32 - clz;
395#elif defined(WIN32)
396    unsigned char s;
397    unsigned long idx;
398    s = _BitScanReverse(&idx, sz);
399    assert(s);
400    return (unsigned) idx + 1;
401#else
402#error This function contains a bug!
403    unsigned clz;
404    size_t y;
405
406    --sz;
407    clz = 32;
408    y = sz >> 16;   if (y) { clz -= 16; sz = y; }
409    y = sz >>  8;   if (y) { clz -=  8; sz = y; }
410    y = sz >>  4;   if (y) { clz -=  4; sz = y; }
411    y = sz >>  2;   if (y) { clz -=  2; sz = y; }
412    y = sz >>  1;   if (y) return 32 - clz + 1;
413    return 32 - clz + sz;
414#endif
415}
416
417
418static unsigned
419find_free_slot (uint64_t slots)
420{
421#if __GNUC__
422    return __builtin_ffsll(~slots) - 1;
423#else
424    unsigned n;
425
426    slots =~ slots;
427    n = 0;
428
429    if (0 == (slots & ((1ULL << 32) - 1))) { n += 32; slots >>= 32; }
430    if (0 == (slots & ((1ULL << 16) - 1))) { n += 16; slots >>= 16; }
431    if (0 == (slots & ((1ULL <<  8) - 1))) { n +=  8; slots >>=  8; }
432    if (0 == (slots & ((1ULL <<  4) - 1))) { n +=  4; slots >>=  4; }
433    if (0 == (slots & ((1ULL <<  2) - 1))) { n +=  2; slots >>=  2; }
434    if (0 == (slots & ((1ULL <<  1) - 1))) { n +=  1; slots >>=  1; }
435    return n;
436#endif
437}
438#endif
439
440
441size_t
442lsquic_malo_mem_used (const struct malo *malo)
443{
444#if LSQUIC_USE_POOLS
445    const struct malo_page *page;
446    size_t size;
447
448    size = 0;
449    SLIST_FOREACH(page, &malo->all_pages, next_page)
450        size += sizeof(*page);
451
452    return size;
453#else
454    return 0;
455#endif
456}
457