lsquic_di_nocopy.c revision 229fce07
1/* Copyright (c) 2017 - 2019 LiteSpeed Technologies Inc.  See LICENSE. */
2/*
3 * lsquic_di_nocopy.c -- The "no-copy" data in stream.
4 *
5 * Data from packets is not copied: the packets are referenced by stream
6 * frames.  When all data from stream frame is read, the frame is released
7 * and packet reference count is decremented, which possibly results in
8 * packet being released as well.
9 *
10 * This approach works well in regular circumstances; there are two scenarios
11 * when it does not:
12 *
13 *  A.  If we have many out-of-order frames, insertion into the list becomes
14 *      expensive.  In the degenerate case, we'd have to traverse the whole
15 *      list to find appropriate position.
16 *
17 *  B.  Having many frames ties up resources, as each frame keeps a reference
18 *      to the packet that contains it.  This is a possible attack vector:
19 *      send many one-byte packets; a single hole at the beginning will stop
20 *      the server from being able to read the stream, thus tying up resources.
21 *
22 * If we detect that either (A) or (B) is true, we request that the stream
23 * switch to a more robust incoming stream frame handler by setting
24 * DI_SWITCH_IMPL flag.
25 *
26 * For a small number of elements, (A) and (B) do not matter and the checks
27 * are not performed.  This number is defined by EFF_CHECK_THRESH_LOW.  On
28 * the other side of the spectrum, if the number of frames grows very high,
29 * we want to switch to a more memory-efficient implementation even if (A)
30 * and (B) are not true.  EFF_CHECK_THRESH_HIGH defines this threshold.
31 *
32 * Between the low and high thresholds, we detect efficiency problems as
33 * follows.
34 *
35 * To detect (A), we count how many elements we have to traverse during
36 * insertion.  If we have to traverse at least half the list
37 * EFF_FAR_TRAVERSE_COUNT in a row, DI_SWITCH_IMPL is issued.
38 *
39 * If average stream frame size is smaller than EFF_TINY_FRAME_SZ bytes,
40 * (B) condition is true.  In addition, if there are more than EFF_MAX_HOLES
41 * in the stream, this is also indicative of (B).
42 */
43
44
45#include <assert.h>
46#include <inttypes.h>
47#include <stddef.h>
48#include <stdint.h>
49#include <stdlib.h>
50#include <sys/queue.h>
51
52#include "lsquic.h"
53#include "lsquic_types.h"
54#include "lsquic_int_types.h"
55#include "lsquic_conn_flow.h"
56#include "lsquic_packet_common.h"
57#include "lsquic_packet_in.h"
58#include "lsquic_rtt.h"
59#include "lsquic_sfcw.h"
60#include "lsquic_stream.h"
61#include "lsquic_mm.h"
62#include "lsquic_malo.h"
63#include "lsquic_conn.h"
64#include "lsquic_conn_public.h"
65#include "lsquic_data_in_if.h"
66
67
68#define LSQUIC_LOGGER_MODULE LSQLM_DI
69#define LSQUIC_LOG_CONN_ID ncdi->ncdi_conn_pub->lconn->cn_cid
70#define LSQUIC_LOG_STREAM_ID ncdi->ncdi_stream_id
71#include "lsquic_logger.h"
72
73
74/* If number of elements is at or below this number, we do not bother to check
75 * efficiency conditions.
76 */
77#define EFF_CHECK_THRESH_LOW    10
78
79/* If number of elements is higher than this number, efficiency alert
80 * is issued unconditionally.
81 */
82#define EFF_CHECK_THRESH_HIGH   1000
83
84/* Maximum number of consecutive far traversals */
85#define EFF_FAR_TRAVERSE_COUNT  4
86
87/* Maximum number of holes that is not deemed suspicious */
88#define EFF_MAX_HOLES           5
89
90/* What is deemed a tiny frame, in bytes.  If it is a power of two, calculation
91 * is cheaper.
92 */
93#define EFF_TINY_FRAME_SZ       64
94
95
96TAILQ_HEAD(stream_frames_tailq, stream_frame);
97
98
99struct nocopy_data_in
100{
101    struct stream_frames_tailq  ncdi_frames_in;
102    struct data_in              ncdi_data_in;
103    struct lsquic_conn_public  *ncdi_conn_pub;
104    uint64_t                    ncdi_byteage;
105    uint64_t                    ncdi_fin_off;
106    uint32_t                    ncdi_stream_id;
107    unsigned                    ncdi_n_frames;
108    unsigned                    ncdi_n_holes;
109    unsigned                    ncdi_cons_far;
110    enum {
111        NCDI_FIN_SET        = 1 << 0,
112        NCDI_FIN_REACHED    = 1 << 1,
113    }                           ncdi_flags;
114};
115
116
117#define NCDI_PTR(data_in) (struct nocopy_data_in *) \
118    ((unsigned char *) (data_in) - offsetof(struct nocopy_data_in, ncdi_data_in))
119
120#define STREAM_FRAME_PTR(data_frame) (struct stream_frame *) \
121    ((unsigned char *) (data_frame) - offsetof(struct stream_frame, data_frame))
122
123
124static const struct data_in_iface *di_if_nocopy_ptr;
125
126
127struct data_in *
128data_in_nocopy_new (struct lsquic_conn_public *conn_pub, uint32_t stream_id)
129{
130    struct nocopy_data_in *ncdi;
131
132    ncdi = malloc(sizeof(*ncdi));
133    if (!ncdi)
134        return NULL;
135
136    TAILQ_INIT(&ncdi->ncdi_frames_in);
137    ncdi->ncdi_data_in.di_if    = di_if_nocopy_ptr;
138    ncdi->ncdi_data_in.di_flags = 0;
139    ncdi->ncdi_conn_pub         = conn_pub;
140    ncdi->ncdi_stream_id        = stream_id;
141    ncdi->ncdi_byteage          = 0;
142    ncdi->ncdi_n_frames         = 0;
143    ncdi->ncdi_n_holes          = 0;
144    ncdi->ncdi_cons_far         = 0;
145    ncdi->ncdi_fin_off          = 0;
146    ncdi->ncdi_flags            = 0;
147    LSQ_DEBUG("initialized");
148    return &ncdi->ncdi_data_in;
149}
150
151
152static void
153nocopy_di_destroy (struct data_in *data_in)
154{
155    struct nocopy_data_in *const ncdi = NCDI_PTR(data_in);
156    stream_frame_t *frame;
157    while ((frame = TAILQ_FIRST(&ncdi->ncdi_frames_in)))
158    {
159        TAILQ_REMOVE(&ncdi->ncdi_frames_in, frame, next_frame);
160        lsquic_packet_in_put(ncdi->ncdi_conn_pub->mm, frame->packet_in);
161        lsquic_malo_put(frame);
162    }
163    free(ncdi);
164}
165
166
167#define DF_OFF(frame) (frame)->data_frame.df_offset
168#define DF_FIN(frame) (frame)->data_frame.df_fin
169#define DF_SIZE(frame) (frame)->data_frame.df_size
170#define DF_END(frame) (DF_OFF(frame) + DF_SIZE(frame))
171
172
173#if LSQUIC_EXTRA_CHECKS
174static int
175frame_list_is_sane (const struct nocopy_data_in *ncdi)
176{
177    const stream_frame_t *frame;
178    uint64_t prev_off = 0, prev_end = 0;
179    int ordered = 1, overlaps = 0;
180    TAILQ_FOREACH(frame, &ncdi->ncdi_frames_in, next_frame)
181    {
182        ordered &= prev_off <= DF_OFF(frame);
183        overlaps |= prev_end > DF_OFF(frame);
184        prev_off = DF_OFF(frame);
185        prev_end = DF_END(frame);
186    }
187    return ordered && !overlaps;
188}
189
190
191#define CHECK_ORDER(ncdi) assert(frame_list_is_sane(ncdi))
192#else
193#define CHECK_ORDER(ncdi)
194#endif
195
196
197#define CASE(letter) ((int) (letter) << 8)
198
199/* Not all errors are picked up by this function, as it is expensive (and
200 * potentially error-prone) to check for all possible error conditions.
201 * It an error be misclassified as an overlap or dup, in the worst case
202 * we end up with an application error instead of protocol violation.
203 */
204static int
205insert_frame (struct nocopy_data_in *ncdi, struct stream_frame *new_frame,
206                                    uint64_t read_offset, unsigned *p_n_frames)
207{
208    stream_frame_t *prev_frame, *next_frame;
209    unsigned count;
210
211    if (read_offset > DF_END(new_frame))
212    {
213        if (DF_FIN(new_frame))
214            return INS_FRAME_ERR                                | CASE('A');
215        else
216            return INS_FRAME_DUP                                | CASE('B');
217    }
218
219    if (ncdi->ncdi_flags & NCDI_FIN_SET)
220    {
221        if (DF_FIN(new_frame) && DF_END(new_frame) != ncdi->ncdi_fin_off)
222            return INS_FRAME_ERR                                | CASE('C');
223        if (DF_END(new_frame) > ncdi->ncdi_fin_off)
224            return INS_FRAME_ERR                                | CASE('D');
225        if (read_offset == DF_END(new_frame))
226            return INS_FRAME_DUP                                | CASE('M');
227    }
228    else
229    {
230        if (read_offset == DF_END(new_frame) && !DF_FIN(new_frame))
231            return INS_FRAME_DUP                                | CASE('L');
232    }
233
234    /* Find position in the list, going backwards.  We go backwards because
235     * that is the most likely scenario.
236     */
237    next_frame = TAILQ_LAST(&ncdi->ncdi_frames_in, stream_frames_tailq);
238    if (next_frame && DF_OFF(new_frame) < DF_OFF(next_frame))
239    {
240        count = 1;
241        prev_frame = TAILQ_PREV(next_frame, stream_frames_tailq, next_frame);
242        for ( ; prev_frame && DF_OFF(new_frame) < DF_OFF(next_frame);
243                next_frame = prev_frame,
244                    prev_frame = TAILQ_PREV(prev_frame, stream_frames_tailq, next_frame))
245        {
246            if (DF_OFF(new_frame) >= DF_OFF(prev_frame))
247                break;
248            ++count;
249        }
250    }
251    else
252    {
253        count = 0;
254        prev_frame = NULL;
255    }
256
257    if (!prev_frame && next_frame && DF_OFF(new_frame) >= DF_OFF(next_frame))
258    {
259        prev_frame = next_frame;
260        next_frame = TAILQ_NEXT(next_frame, next_frame);
261    }
262
263    const int select = !!prev_frame << 1 | !!next_frame;
264    switch (select)
265    {
266    default:    /* No neighbors */
267        if (read_offset == DF_END(new_frame))
268        {
269            if (DF_SIZE(new_frame))
270            {
271                if (DF_FIN(new_frame)
272                    && !((ncdi->ncdi_flags & NCDI_FIN_REACHED)
273                            && read_offset == ncdi->ncdi_fin_off))
274                    return INS_FRAME_OVERLAP                    | CASE('E');
275                else
276                    return INS_FRAME_DUP                        | CASE('F');
277            }
278            else if (!DF_FIN(new_frame)
279                     || ((ncdi->ncdi_flags & NCDI_FIN_REACHED)
280                         && read_offset == ncdi->ncdi_fin_off))
281                return INS_FRAME_DUP                            | CASE('G');
282        }
283        goto list_was_empty;
284    case 3:     /* Both left and right neighbors */
285    case 2:     /* Only left neighbor (prev_frame) */
286        if (DF_OFF(prev_frame) == DF_OFF(new_frame)
287            && DF_SIZE(prev_frame) == DF_SIZE(new_frame))
288        {
289            if (!DF_FIN(prev_frame) && DF_FIN(new_frame))
290                return INS_FRAME_OVERLAP                        | CASE('H');
291            else
292                return INS_FRAME_DUP                            | CASE('I');
293        }
294        if (DF_END(prev_frame) > DF_OFF(new_frame))
295            return INS_FRAME_OVERLAP                            | CASE('J');
296        if (select == 2)
297            goto have_prev;
298        /* Fall-through */
299    case 1:     /* Only right neighbor (next_frame) */
300        if (DF_END(new_frame) > DF_OFF(next_frame))
301            return INS_FRAME_OVERLAP                            | CASE('K');
302        break;
303    }
304
305    if (prev_frame)
306    {
307  have_prev:
308        TAILQ_INSERT_AFTER(&ncdi->ncdi_frames_in, prev_frame, new_frame, next_frame);
309        ncdi->ncdi_n_holes += DF_END(prev_frame) != DF_OFF(new_frame);
310        if (next_frame)
311        {
312            ncdi->ncdi_n_holes += DF_END(new_frame) != DF_OFF(next_frame);
313            --ncdi->ncdi_n_holes;
314        }
315    }
316    else
317    {
318        ncdi->ncdi_n_holes += next_frame
319                           && DF_END(new_frame) != DF_OFF(next_frame);
320  list_was_empty:
321        TAILQ_INSERT_HEAD(&ncdi->ncdi_frames_in, new_frame, next_frame);
322    }
323    CHECK_ORDER(ncdi);
324
325    if (DF_FIN(new_frame))
326    {
327        ncdi->ncdi_flags |= NCDI_FIN_SET;
328        ncdi->ncdi_fin_off = DF_END(new_frame);
329        LSQ_DEBUG("FIN set at %"PRIu64, DF_END(new_frame));
330    }
331
332    ++ncdi->ncdi_n_frames;
333    ncdi->ncdi_byteage += DF_SIZE(new_frame);
334    *p_n_frames = count;
335
336    return INS_FRAME_OK                                         | CASE('Z');
337}
338
339
340static int
341check_efficiency (struct nocopy_data_in *ncdi, unsigned count)
342{
343    if (ncdi->ncdi_n_frames <= EFF_CHECK_THRESH_LOW)
344    {
345        ncdi->ncdi_cons_far = 0;
346        return 0;
347    }
348    if (ncdi->ncdi_n_frames > EFF_CHECK_THRESH_HIGH)
349        return 1;
350    if (count >= ncdi->ncdi_n_frames / 2)
351    {
352        ++ncdi->ncdi_cons_far;
353        if (ncdi->ncdi_cons_far > EFF_FAR_TRAVERSE_COUNT)
354            return 1;
355    }
356    else
357        ncdi->ncdi_cons_far = 0;
358    if (ncdi->ncdi_n_holes > EFF_MAX_HOLES)
359        return 1;
360    if (ncdi->ncdi_byteage / EFF_TINY_FRAME_SZ < ncdi->ncdi_n_frames)
361        return 1;
362    return 0;
363}
364
365
366static void
367set_eff_alert (struct nocopy_data_in *ncdi)
368{
369    LSQ_DEBUG("low efficiency: n_frames: %u; n_holes: %u; cons_far: %u; "
370              "byteage: %"PRIu64, ncdi->ncdi_n_frames, ncdi->ncdi_n_holes,
371              ncdi->ncdi_cons_far, ncdi->ncdi_byteage);
372    ncdi->ncdi_data_in.di_flags |= DI_SWITCH_IMPL;
373}
374
375
376static enum ins_frame
377nocopy_di_insert_frame (struct data_in *data_in,
378                        struct stream_frame *new_frame, uint64_t read_offset)
379{
380    struct nocopy_data_in *const ncdi = NCDI_PTR(data_in);
381    unsigned count;
382    enum ins_frame ins;
383    int ins_case;
384
385    assert(0 == (new_frame->data_frame.df_fin & ~1));
386    ins_case = insert_frame(ncdi, new_frame, read_offset, &count);
387    ins = ins_case & 0xFF;
388    ins_case >>= 8;
389    LSQ_DEBUG("%s: ins: %d (case '%c')", __func__, ins, (char) ins_case);
390    switch (ins)
391    {
392    case INS_FRAME_OK:
393        if (check_efficiency(ncdi, count))
394            set_eff_alert(ncdi);
395        break;
396    case INS_FRAME_DUP:
397    case INS_FRAME_ERR:
398        lsquic_packet_in_put(ncdi->ncdi_conn_pub->mm, new_frame->packet_in);
399        lsquic_malo_put(new_frame);
400        break;
401    default:
402        break;
403    }
404
405    return ins;
406}
407
408
409static struct data_frame *
410nocopy_di_get_frame (struct data_in *data_in, uint64_t read_offset)
411{
412    struct nocopy_data_in *const ncdi = NCDI_PTR(data_in);
413    struct stream_frame *frame = TAILQ_FIRST(&ncdi->ncdi_frames_in);
414    if (frame && frame->data_frame.df_offset +
415                                frame->data_frame.df_read_off == read_offset)
416    {
417        LSQ_DEBUG("get_frame: frame (off: %"PRIu64", size: %u, fin: %d), at "
418            "read offset %"PRIu64, DF_OFF(frame), DF_SIZE(frame), DF_FIN(frame),
419            read_offset);
420        return &frame->data_frame;
421    }
422    else
423    {
424        LSQ_DEBUG("get_frame: no frame at read offset %"PRIu64, read_offset);
425        return NULL;
426    }
427}
428
429
430static void
431nocopy_di_frame_done (struct data_in *data_in, struct data_frame *data_frame)
432{
433    struct nocopy_data_in *const ncdi = NCDI_PTR(data_in);
434    struct stream_frame *const frame = STREAM_FRAME_PTR(data_frame), *first;
435    assert(data_frame->df_read_off == data_frame->df_size);
436    TAILQ_REMOVE(&ncdi->ncdi_frames_in, frame, next_frame);
437    first = TAILQ_FIRST(&ncdi->ncdi_frames_in);
438    ncdi->ncdi_n_holes -= first && frame->data_frame.df_offset +
439                    frame->data_frame.df_size != first->data_frame.df_offset;
440    --ncdi->ncdi_n_frames;
441    ncdi->ncdi_byteage -= frame->data_frame.df_size;
442    if (DF_FIN(frame))
443    {
444        ncdi->ncdi_flags |= NCDI_FIN_REACHED;
445        LSQ_DEBUG("FIN has been reached at offset %"PRIu64, DF_END(frame));
446    }
447    LSQ_DEBUG("frame (off: %"PRIu64", size: %u, fin: %d) done",
448                                DF_OFF(frame), DF_SIZE(frame), DF_FIN(frame));
449    lsquic_packet_in_put(ncdi->ncdi_conn_pub->mm, frame->packet_in);
450    lsquic_malo_put(frame);
451}
452
453
454static int
455nocopy_di_empty (struct data_in *data_in)
456{
457    struct nocopy_data_in *const ncdi = NCDI_PTR(data_in);
458    return TAILQ_EMPTY(&ncdi->ncdi_frames_in);
459}
460
461
462struct data_in *
463nocopy_di_switch_impl (struct data_in *data_in, uint64_t read_offset)
464{
465    struct nocopy_data_in *const ncdi = NCDI_PTR(data_in);
466    struct data_in *new_data_in;
467    stream_frame_t *frame;
468    enum ins_frame ins;
469
470    new_data_in = data_in_hash_new(ncdi->ncdi_conn_pub, ncdi->ncdi_stream_id,
471                                                        ncdi->ncdi_byteage);
472    if (!new_data_in)
473        goto end;
474
475    while ((frame = TAILQ_FIRST(&ncdi->ncdi_frames_in)))
476    {
477        TAILQ_REMOVE(&ncdi->ncdi_frames_in, frame, next_frame);
478        ins = data_in_hash_insert_data_frame(new_data_in, &frame->data_frame,
479                                                                  read_offset);
480        lsquic_packet_in_put(ncdi->ncdi_conn_pub->mm, frame->packet_in);
481        lsquic_malo_put(frame);
482        if (INS_FRAME_ERR == ins)
483        {
484            new_data_in->di_if->di_destroy(new_data_in);
485            new_data_in = NULL;
486            goto end;
487        }
488    }
489
490  end:
491    data_in->di_if->di_destroy(data_in);
492    return new_data_in;
493}
494
495
496/* This function overestimates amount of memory because some packets are
497 * referenced by more than one stream.  In the usual case, however, I
498 * expect the error not to be large.
499 */
500static size_t
501nocopy_di_mem_used (struct data_in *data_in)
502{
503    struct nocopy_data_in *const ncdi = NCDI_PTR(data_in);
504    const stream_frame_t *frame;
505    size_t size;
506
507    size = sizeof(*data_in);
508    TAILQ_FOREACH(frame, &ncdi->ncdi_frames_in, next_frame)
509        size += lsquic_packet_in_mem_used(frame->packet_in);
510
511    return size;
512}
513
514
515static const struct data_in_iface di_if_nocopy = {
516    .di_destroy      = nocopy_di_destroy,
517    .di_empty        = nocopy_di_empty,
518    .di_frame_done   = nocopy_di_frame_done,
519    .di_get_frame    = nocopy_di_get_frame,
520    .di_insert_frame = nocopy_di_insert_frame,
521    .di_mem_used     = nocopy_di_mem_used,
522    .di_switch_impl  = nocopy_di_switch_impl,
523};
524
525static const struct data_in_iface *di_if_nocopy_ptr = &di_if_nocopy;
526