lsquic_malo.c revision 5392f7a3
1/* Copyright (c) 2017 - 2019 LiteSpeed Technologies Inc.  See LICENSE. */
2/*
3 * lsquic_malo.c -- malo allocator implementation.
4 *
5 * The malo allocator is a pool of objects of fixed size.  It tries to
6 * allocate and deallocate objects as fast as possible.  To do so, it
7 * does the following:
8 *
9 *  1. Allocations occur 4 KB at a time.
10 *  2. No division or multiplication operations are performed for
11 *     appropriately sized objects.  (More on this below.)
12 *
13 * (In recent testing, malo was about 2.7 times faster than malloc for
14 * 64-byte objects.)
15 *
16 * Besides speed, two other important characteristics distinguish it
17 * from other pool allocators:
18 *
19 *  1. To free (put) an object, one does not need a pointer to the malo
20 *     object.  This makes this allocator easy to use.
21 *  2. A built-in iterator is provided to iterate over all allocated
22 *     objects (with ability safely to release objects while iterator
23 *     is active).  This may be useful in some circumstances.
24 *
25 * To gain all these advantages, there are trade-offs:
26 *
27 *  1. There are two memory penalties:
28 *      a. Per object overhead.  If an object is at least ROUNDUP_THRESH in
29 *         size as the next power of two, the allocator uses that power of
30 *         two value as the object size.  This is done to avoid using
31 *         division and multiplication.  For example, a 104-byte object
32 *         will have a 24-byte overhead.
33 *      b. Per page overhead.  Page links occupy some bytes in the
34 *         page.  To keep things fast, at least one slot per page is
35 *         always occupied, independent of object size.  Thus, for a
36 *         1 KB object size, 25% of the page is used for the page
37 *         header.
38 *  2. 4 KB pages are not freed until the malo allocator is destroyed.
39 *     This is something to keep in mind.
40 *
41 * P.S. In Russian, "malo" (��������) means "little" or "few".  Thus, the
42 *      malo allocator aims to perform its job in as few CPU cycles as
43 *      possible.
44 */
45
46#include <assert.h>
47#include <errno.h>
48#include <stdint.h>
49#include <stdlib.h>
50#include <sys/queue.h>
51#ifdef WIN32
52#include <vc_compat.h>
53#include <intrin.h>
54#endif
55
56#include "fiu-local.h"
57#include "lsquic_malo.h"
58
59#ifndef LSQUIC_USE_POOLS
60#define LSQUIC_USE_POOLS 1
61#endif
62
63/* 64 slots in a 4KB page means that the smallest object is 64 bytes.
64 * The largest object is 2KB.
65 */
66#define MALO_MIN_NBITS 6
67#define MALO_MAX_NBITS 11
68
69#define ROUNDUP_THRESH 0.75f
70
71/* A "free page" is a page with free slots available.
72 */
73
74#if LSQUIC_USE_POOLS
75static unsigned find_free_slot (uint64_t slots);
76static unsigned size_in_bits (size_t sz);
77#endif
78
79struct malo_page {
80    SLIST_ENTRY(malo_page)  next_page;
81    LIST_ENTRY(malo_page)   next_free_page;
82    struct malo            *malo;
83    uint64_t                slots,
84                            full_slot_mask;
85    unsigned                nbits;  /* If pow is zero, stores object size */
86    unsigned                initial_slot;
87    int                     pow;    /* True if object is power of 2 */
88};
89
90typedef char malo_header_fits_in_one_slot
91    [(sizeof(struct malo_page) > (1 << MALO_MIN_NBITS)) ? -1 : 1];
92
93#if !LSQUIC_USE_POOLS
94struct nopool_elem
95{
96    TAILQ_ENTRY(nopool_elem)    next;
97    struct malo                *malo;
98    unsigned char               data[0];
99};
100#endif
101
102struct malo {
103#if LSQUIC_USE_POOLS
104    struct malo_page        page_header;
105    SLIST_HEAD(, malo_page) all_pages;
106    LIST_HEAD(, malo_page)  free_pages;
107    struct {
108        struct malo_page   *cur_page;
109        unsigned            next_slot;
110    }                       iter;
111#else
112    /* List of all elements: used by the iterator */
113    TAILQ_HEAD(, nopool_elem)   elems;
114    size_t                      obj_size;
115    struct nopool_elem         *next_iter_elem;
116#endif
117};
118
119struct malo *
120lsquic_malo_create (size_t obj_size)
121{
122#if LSQUIC_USE_POOLS
123    int pow, n_slots;
124    unsigned nbits;
125
126    obj_size = (obj_size + 7) & -8;
127    nbits = size_in_bits(obj_size);
128    if (nbits < MALO_MIN_NBITS)
129        nbits = MALO_MIN_NBITS;
130    else if (nbits > MALO_MAX_NBITS)
131    {
132        errno = EOVERFLOW;
133        return NULL;
134    }
135
136    pow = obj_size <= (1 << MALO_MIN_NBITS)
137          || (float) obj_size / (1 << nbits) > ROUNDUP_THRESH;
138
139    struct malo *malo;
140    if (0 != posix_memalign((void **) &malo, 0x1000, 0x1000))
141        return NULL;
142
143    SLIST_INIT(&malo->all_pages);
144    LIST_INIT(&malo->free_pages);
145    malo->iter.cur_page = &malo->page_header;
146    malo->iter.next_slot = 0;
147
148    if (pow)
149        n_slots =   sizeof(*malo) / (1 << nbits)
150                + ((sizeof(*malo) % (1 << nbits)) > 0);
151    else
152        n_slots =   sizeof(*malo) / obj_size
153                + ((sizeof(*malo) % obj_size) > 0);
154
155    struct malo_page *const page = &malo->page_header;
156    SLIST_INSERT_HEAD(&malo->all_pages, page, next_page);
157    LIST_INSERT_HEAD(&malo->free_pages, page, next_free_page);
158    page->malo = malo;
159    if (!pow)
160        page->full_slot_mask = (1ULL << (0x1000 / obj_size)) - 1;
161    else if (nbits == MALO_MIN_NBITS)
162        page->full_slot_mask = ~0ULL;
163    else
164        page->full_slot_mask = (1ULL << (1 << (12 - nbits))) - 1;
165    page->slots = (1ULL << n_slots) - 1;
166    page->pow = pow;
167    page->nbits = pow ? nbits : obj_size;
168    page->initial_slot = n_slots;
169
170    return malo;
171#else
172    struct malo *malo;
173
174    /* Use the same sizing mechanism as in the normal version */
175    if (obj_size < (1 << MALO_MIN_NBITS))
176        obj_size = 1 << MALO_MIN_NBITS;
177    else
178        obj_size = (obj_size + 7) & -8;
179
180    malo = malloc(sizeof(*malo));
181    if (malo)
182    {
183        TAILQ_INIT(&malo->elems);
184        malo->obj_size = obj_size;
185        return malo;
186    }
187    else
188        return NULL;
189#endif
190}
191
192
193#if LSQUIC_USE_POOLS
194static struct malo_page *
195allocate_page (struct malo *malo)
196{
197    struct malo_page *page;
198    if (0 != posix_memalign((void **) &page, 0x1000, 0x1000))
199        return NULL;
200    SLIST_INSERT_HEAD(&malo->all_pages, page, next_page);
201    LIST_INSERT_HEAD(&malo->free_pages, page, next_free_page);
202    page->slots = 1;
203    page->full_slot_mask = malo->page_header.full_slot_mask;
204    page->nbits = malo->page_header.nbits;
205    page->pow = malo->page_header.pow;
206    page->malo = malo;
207    page->initial_slot = 1;
208    return page;
209}
210#endif
211
212
213#define FAIL_NOMEM do { errno = ENOMEM; return NULL; } while (0)
214
215/* Get a new object. */
216void *
217lsquic_malo_get (struct malo *malo)
218{
219#if LSQUIC_USE_POOLS
220    fiu_do_on("malo/get", FAIL_NOMEM);
221    struct malo_page *page = LIST_FIRST(&malo->free_pages);
222    if (!page)
223    {
224        page = allocate_page(malo);
225        if (!page)
226            return NULL;
227    }
228    unsigned slot = find_free_slot(page->slots);
229    page->slots |= (1ULL << slot);
230    if (page->full_slot_mask == page->slots)
231        LIST_REMOVE(page, next_free_page);
232    if (page->pow)
233        return (char *) page + (slot << page->nbits);
234    else
235        return (char *) page + (slot * page->nbits);
236#else
237    struct nopool_elem *el;
238    el = malloc(sizeof(*el) + malo->obj_size);
239    if (el)
240    {
241        TAILQ_INSERT_HEAD(&malo->elems, el, next);
242        el->malo = malo;
243        return el->data;
244    }
245    else
246        return NULL;
247#endif
248}
249
250
251/* Return obj to the pool */
252void
253lsquic_malo_put (void *obj)
254{
255#if LSQUIC_USE_POOLS
256    uintptr_t page_addr = (uintptr_t) obj & ~((1 << 12) - 1);
257    struct malo_page *page = (void *) page_addr;
258    unsigned slot;
259    if (page->pow)
260        slot = ((uintptr_t) obj - page_addr) >> page->nbits;
261    else
262        slot = ((uintptr_t) obj - page_addr) / page->nbits;
263    if (page->full_slot_mask == page->slots)
264        LIST_INSERT_HEAD(&page->malo->free_pages, page, next_free_page);
265    page->slots &= ~(1ULL << slot);
266#else
267    struct nopool_elem *el;
268    el = (struct nopool_elem *) ((char *) obj - sizeof(*el));
269    if (el == el->malo->next_iter_elem)
270        el->malo->next_iter_elem = TAILQ_NEXT(el->malo->next_iter_elem, next);
271    TAILQ_REMOVE(&el->malo->elems, el, next);
272    free(el);
273#endif
274}
275
276
277void
278lsquic_malo_destroy (struct malo *malo)
279{
280#if LSQUIC_USE_POOLS
281    struct malo_page *page, *next;
282    page = SLIST_FIRST(&malo->all_pages);
283    while (page != &malo->page_header)
284    {
285        next = SLIST_NEXT(page, next_page);
286#ifndef WIN32
287        free(page);
288#else
289        _aligned_free(page);
290#endif
291        page = next;
292    }
293#ifndef WIN32
294    free(page);
295#else
296        _aligned_free(page);
297#endif
298#else
299    struct nopool_elem *el, *next_el;
300    for (el = TAILQ_FIRST(&malo->elems); el; el = next_el)
301    {
302        next_el = TAILQ_NEXT(el, next);
303        free(el);
304    }
305    free(malo);
306#endif
307}
308
309
310/* The iterator is built-in.  Usage:
311 * void *obj;
312 * for (obj = lsquic_malo_first(malo); obj; lsquic_malo_next(malo))
313 *     do_stuff(obj);
314 */
315void *
316lsquic_malo_first (struct malo *malo)
317{
318#if LSQUIC_USE_POOLS
319    malo->iter.cur_page = SLIST_FIRST(&malo->all_pages);
320    malo->iter.next_slot = malo->iter.cur_page->initial_slot;
321#else
322    malo->next_iter_elem = TAILQ_FIRST(&malo->elems);
323#endif
324    return lsquic_malo_next(malo);
325}
326
327
328void *
329lsquic_malo_next (struct malo *malo)
330{
331#if LSQUIC_USE_POOLS
332    struct malo_page *page;
333    unsigned max_slot, slot;
334
335    page = malo->iter.cur_page;
336    if (page)
337    {
338        if (page->pow)
339            max_slot = 1 << (12 - page->nbits);     /* Same for all pages */
340        else
341            max_slot = 0x1000 / page->nbits;
342        slot = malo->iter.next_slot;
343        while (1)
344        {
345            for (; slot < max_slot; ++slot)
346            {
347                if (page->slots & (1ULL << slot))
348                {
349                    malo->iter.cur_page  = page;
350                    malo->iter.next_slot = slot + 1;
351                    if (page->pow)
352                        return (char *) page + (slot << page->nbits);
353                    else
354                    {
355                        assert(slot * (page->nbits + 1) < 0x1000);
356                        return (char *) page + (slot * page->nbits);
357                    }
358                }
359            }
360            page = SLIST_NEXT(page, next_page);
361            if (page)
362                slot = page->initial_slot;
363            else
364            {
365                malo->iter.cur_page = NULL;     /* Stop iterator */
366                return NULL;
367            }
368        }
369    }
370
371    return NULL;
372#else
373    struct nopool_elem *retval;
374
375    if (malo->next_iter_elem)
376    {
377        retval = malo->next_iter_elem;
378        malo->next_iter_elem = TAILQ_NEXT(malo->next_iter_elem, next);
379        return retval->data;
380    }
381    else
382        return NULL;
383#endif
384}
385
386
387#if LSQUIC_USE_POOLS
388static unsigned
389size_in_bits (size_t sz)
390{
391#if __GNUC__
392    unsigned clz = sz > 1 ? __builtin_clz(sz - 1) : 31;
393    return 32 - clz;
394#elif defined(WIN32)
395    unsigned char s;
396    unsigned long idx;
397    s = _BitScanReverse(&idx, sz);
398    assert(s);
399    return (unsigned) idx + 1;
400#else
401#error This function contains a bug!
402    unsigned clz;
403    size_t y;
404
405    --sz;
406    clz = 32;
407    y = sz >> 16;   if (y) { clz -= 16; sz = y; }
408    y = sz >>  8;   if (y) { clz -=  8; sz = y; }
409    y = sz >>  4;   if (y) { clz -=  4; sz = y; }
410    y = sz >>  2;   if (y) { clz -=  2; sz = y; }
411    y = sz >>  1;   if (y) return 32 - clz + 1;
412    return 32 - clz + sz;
413#endif
414}
415
416
417static unsigned
418find_free_slot (uint64_t slots)
419{
420#if __GNUC__
421    return __builtin_ffsll(~slots) - 1;
422#else
423    unsigned n;
424
425    slots =~ slots;
426    n = 0;
427
428    if (0 == (slots & ((1ULL << 32) - 1))) { n += 32; slots >>= 32; }
429    if (0 == (slots & ((1ULL << 16) - 1))) { n += 16; slots >>= 16; }
430    if (0 == (slots & ((1ULL <<  8) - 1))) { n +=  8; slots >>=  8; }
431    if (0 == (slots & ((1ULL <<  4) - 1))) { n +=  4; slots >>=  4; }
432    if (0 == (slots & ((1ULL <<  2) - 1))) { n +=  2; slots >>=  2; }
433    if (0 == (slots & ((1ULL <<  1) - 1))) { n +=  1; slots >>=  1; }
434    return n;
435#endif
436}
437#endif
438
439
440size_t
441lsquic_malo_mem_used (const struct malo *malo)
442{
443#if LSQUIC_USE_POOLS
444    const struct malo_page *page;
445    size_t size;
446
447    size = 0;
448    SLIST_FOREACH(page, &malo->all_pages, next_page)
449        size += sizeof(*page);
450
451    return size;
452#else
453    return 0;
454#endif
455}
456