lsquic_di_nocopy.c revision 5f5d395b
1/* Copyright (c) 2017 - 2018 LiteSpeed Technologies Inc. See LICENSE. */ 2/* 3 * lsquic_di_nocopy.c -- The "no-copy" data in stream. 4 * 5 * Data from packets is not copied: the packets are referenced by stream 6 * frames. When all data from stream frame is read, the frame is released 7 * and packet reference count is decremented, which possibly results in 8 * packet being released as well. 9 * 10 * This approach works well in regular circumstances; there are two scenarios 11 * when it does not: 12 * 13 * A. If we have many out-of-order frames, insertion into the list becomes 14 * expensive. In the degenerate case, we'd have to traverse the whole 15 * list to find appropriate position. 16 * 17 * B. Having many frames ties up resources, as each frame keeps a reference 18 * to the packet that contains it. This is a possible attack vector: 19 * send many one-byte packets; a single hole at the beginning will stop 20 * the server from being able to read the stream, thus tying up resources. 21 * 22 * If we detect that either (A) or (B) is true, we request that the stream 23 * switch to a more robust incoming stream frame handler by setting 24 * DI_SWITCH_IMPL flag. 25 * 26 * For a small number of elements, (A) and (B) do not matter and the checks 27 * are not performed. This number is defined by EFF_CHECK_THRESH_LOW. On 28 * the other side of the spectrum, if the number of frames grows very high, 29 * we want to switch to a more memory-efficient implementation even if (A) 30 * and (B) are not true. EFF_CHECK_THRESH_HIGH defines this threshold. 31 * 32 * Between the low and high thresholds, we detect efficiency problems as 33 * follows. 34 * 35 * To detect (A), we count how many elements we have to traverse during 36 * insertion. If we have to traverse at least half the list 37 * EFF_FAR_TRAVERSE_COUNT in a row, DI_SWITCH_IMPL is issued. 38 * 39 * If average stream frame size is smaller than EFF_TINY_FRAME_SZ bytes, 40 * (B) condition is true. In addition, if there are more than EFF_MAX_HOLES 41 * in the stream, this is also indicative of (B). 42 */ 43 44 45#include <assert.h> 46#include <inttypes.h> 47#include <stddef.h> 48#include <stdint.h> 49#include <stdlib.h> 50#include <sys/queue.h> 51 52#include "lsquic.h" 53#include "lsquic_types.h" 54#include "lsquic_int_types.h" 55#include "lsquic_conn_flow.h" 56#include "lsquic_packet_common.h" 57#include "lsquic_packet_in.h" 58#include "lsquic_rtt.h" 59#include "lsquic_sfcw.h" 60#include "lsquic_stream.h" 61#include "lsquic_mm.h" 62#include "lsquic_malo.h" 63#include "lsquic_conn.h" 64#include "lsquic_conn_public.h" 65#include "lsquic_data_in_if.h" 66 67 68#define LSQUIC_LOGGER_MODULE LSQLM_DI 69#define LSQUIC_LOG_CONN_ID ncdi->ncdi_conn_pub->lconn->cn_cid 70#define LSQUIC_LOG_STREAM_ID ncdi->ncdi_stream_id 71#include "lsquic_logger.h" 72 73 74/* If number of elements is at or below this number, we do not bother to check 75 * efficiency conditions. 76 */ 77#define EFF_CHECK_THRESH_LOW 10 78 79/* If number of elements is higher than this number, efficiency alert 80 * is issued unconditionally. 81 */ 82#define EFF_CHECK_THRESH_HIGH 1000 83 84/* Maximum number of consecutive far traversals */ 85#define EFF_FAR_TRAVERSE_COUNT 4 86 87/* Maximum number of holes that is not deemed suspicious */ 88#define EFF_MAX_HOLES 5 89 90/* What is deemed a tiny frame, in bytes. If it is a power of two, calculation 91 * is cheaper. 92 */ 93#define EFF_TINY_FRAME_SZ 64 94 95 96TAILQ_HEAD(stream_frames_tailq, stream_frame); 97 98 99struct nocopy_data_in 100{ 101 struct stream_frames_tailq ncdi_frames_in; 102 struct data_in ncdi_data_in; 103 struct lsquic_conn_public *ncdi_conn_pub; 104 uint64_t ncdi_byteage; 105 uint64_t ncdi_fin_off; 106 uint32_t ncdi_stream_id; 107 unsigned ncdi_n_frames; 108 unsigned ncdi_n_holes; 109 unsigned ncdi_cons_far; 110 enum { 111 NCDI_FIN_SET = 1 << 0, 112 NCDI_FIN_REACHED = 1 << 1, 113 } ncdi_flags; 114}; 115 116 117#define NCDI_PTR(data_in) (struct nocopy_data_in *) \ 118 ((unsigned char *) (data_in) - offsetof(struct nocopy_data_in, ncdi_data_in)) 119 120#define STREAM_FRAME_PTR(data_frame) (struct stream_frame *) \ 121 ((unsigned char *) (data_frame) - offsetof(struct stream_frame, data_frame)) 122 123 124static const struct data_in_iface *di_if_nocopy_ptr; 125 126 127struct data_in * 128data_in_nocopy_new (struct lsquic_conn_public *conn_pub, uint32_t stream_id) 129{ 130 struct nocopy_data_in *ncdi; 131 132 ncdi = malloc(sizeof(*ncdi)); 133 if (!ncdi) 134 return NULL; 135 136 TAILQ_INIT(&ncdi->ncdi_frames_in); 137 ncdi->ncdi_data_in.di_if = di_if_nocopy_ptr; 138 ncdi->ncdi_data_in.di_flags = 0; 139 ncdi->ncdi_conn_pub = conn_pub; 140 ncdi->ncdi_stream_id = stream_id; 141 ncdi->ncdi_byteage = 0; 142 ncdi->ncdi_n_frames = 0; 143 ncdi->ncdi_n_holes = 0; 144 ncdi->ncdi_cons_far = 0; 145 ncdi->ncdi_fin_off = 0; 146 ncdi->ncdi_flags = 0; 147 LSQ_DEBUG("initialized"); 148 return &ncdi->ncdi_data_in; 149} 150 151 152static void 153nocopy_di_destroy (struct data_in *data_in) 154{ 155 struct nocopy_data_in *const ncdi = NCDI_PTR(data_in); 156 stream_frame_t *frame; 157 while ((frame = TAILQ_FIRST(&ncdi->ncdi_frames_in))) 158 { 159 TAILQ_REMOVE(&ncdi->ncdi_frames_in, frame, next_frame); 160 lsquic_packet_in_put(ncdi->ncdi_conn_pub->mm, frame->packet_in); 161 lsquic_malo_put(frame); 162 } 163 free(ncdi); 164} 165 166 167#define DF_OFF(frame) (frame)->data_frame.df_offset 168#define DF_FIN(frame) (frame)->data_frame.df_fin 169#define DF_SIZE(frame) (frame)->data_frame.df_size 170#define DF_END(frame) (DF_OFF(frame) + DF_SIZE(frame)) 171 172 173#if LSQUIC_EXTRA_CHECKS 174static int 175frame_list_is_sane (const struct nocopy_data_in *ncdi) 176{ 177 const stream_frame_t *frame; 178 uint64_t prev_off = 0, prev_end = 0; 179 int ordered = 1, overlaps = 0; 180 TAILQ_FOREACH(frame, &ncdi->ncdi_frames_in, next_frame) 181 { 182 ordered &= prev_off <= DF_OFF(frame); 183 overlaps |= prev_end > DF_OFF(frame); 184 prev_off = DF_OFF(frame); 185 prev_end = DF_END(frame); 186 } 187 return ordered && !overlaps; 188} 189 190 191#define CHECK_ORDER(ncdi) assert(frame_list_is_sane(ncdi)) 192#else 193#define CHECK_ORDER(ncdi) 194#endif 195 196 197#define CASE(letter) ((int) (letter) << 8) 198 199/* Not all errors are picked up by this function, as it is expensive (and 200 * potentially error-prone) to check for all possible error conditions. 201 * It an error be misclassified as an overlap or dup, in the worst case 202 * we end up with an application error instead of protocol violation. 203 */ 204static int 205insert_frame (struct nocopy_data_in *ncdi, struct stream_frame *new_frame, 206 uint64_t read_offset, unsigned *p_n_frames) 207{ 208 stream_frame_t *prev_frame, *next_frame; 209 unsigned count; 210 211 if (read_offset > DF_END(new_frame)) 212 { 213 if (DF_FIN(new_frame)) 214 return INS_FRAME_ERR | CASE('A'); 215 else 216 return INS_FRAME_DUP | CASE('B'); 217 } 218 219 if (ncdi->ncdi_flags & NCDI_FIN_SET) 220 { 221 if (DF_FIN(new_frame) && DF_END(new_frame) != ncdi->ncdi_fin_off) 222 return INS_FRAME_ERR | CASE('C'); 223 if (DF_END(new_frame) > ncdi->ncdi_fin_off) 224 return INS_FRAME_ERR | CASE('D'); 225 if (read_offset == DF_END(new_frame)) 226 return INS_FRAME_DUP | CASE('M'); 227 } 228 else 229 { 230 if (read_offset == DF_END(new_frame) && !DF_FIN(new_frame)) 231 return INS_FRAME_DUP | CASE('L'); 232 } 233 234 /* Find position in the list, going backwards. We go backwards because 235 * that is the most likely scenario. 236 */ 237 next_frame = TAILQ_LAST(&ncdi->ncdi_frames_in, stream_frames_tailq); 238 if (next_frame && DF_OFF(new_frame) < DF_OFF(next_frame)) 239 { 240 count = 1; 241 prev_frame = TAILQ_PREV(next_frame, stream_frames_tailq, next_frame); 242 for ( ; prev_frame && DF_OFF(new_frame) < DF_OFF(next_frame); 243 next_frame = prev_frame, 244 prev_frame = TAILQ_PREV(prev_frame, stream_frames_tailq, next_frame)) 245 { 246 if (DF_OFF(new_frame) >= DF_OFF(prev_frame)) 247 break; 248 ++count; 249 } 250 } 251 else 252 { 253 count = 0; 254 prev_frame = NULL; 255 } 256 257 if (!prev_frame && next_frame && DF_OFF(new_frame) >= DF_OFF(next_frame)) 258 { 259 prev_frame = next_frame; 260 next_frame = TAILQ_NEXT(next_frame, next_frame); 261 } 262 263 const int select = !!prev_frame << 1 | !!next_frame; 264 switch (select) 265 { 266 default: /* No neighbors */ 267 if (read_offset == DF_END(new_frame)) 268 { 269 if (DF_SIZE(new_frame)) 270 { 271 if (DF_FIN(new_frame) 272 && !((ncdi->ncdi_flags & NCDI_FIN_REACHED) 273 && read_offset == ncdi->ncdi_fin_off)) 274 return INS_FRAME_OVERLAP | CASE('E'); 275 else 276 return INS_FRAME_DUP | CASE('F'); 277 } 278 else if (!DF_FIN(new_frame) 279 || ((ncdi->ncdi_flags & NCDI_FIN_REACHED) 280 && read_offset == ncdi->ncdi_fin_off)) 281 return INS_FRAME_DUP | CASE('G'); 282 } 283 goto list_was_empty; 284 case 3: /* Both left and right neighbors */ 285 case 2: /* Only left neighbor (prev_frame) */ 286 if (DF_OFF(prev_frame) == DF_OFF(new_frame) 287 && DF_SIZE(prev_frame) == DF_SIZE(new_frame)) 288 { 289 if (!DF_FIN(prev_frame) && DF_FIN(new_frame)) 290 return INS_FRAME_OVERLAP | CASE('H'); 291 else 292 return INS_FRAME_DUP | CASE('I'); 293 } 294 if (DF_END(prev_frame) > DF_OFF(new_frame)) 295 return INS_FRAME_OVERLAP | CASE('J'); 296 if (select == 2) 297 goto have_prev; 298 /* Fall-through */ 299 case 1: /* Only right neighbor (next_frame) */ 300 if (DF_END(new_frame) > DF_OFF(next_frame)) 301 return INS_FRAME_OVERLAP | CASE('K'); 302 break; 303 } 304 305 if (prev_frame) 306 { 307 have_prev: 308 TAILQ_INSERT_AFTER(&ncdi->ncdi_frames_in, prev_frame, new_frame, next_frame); 309 ncdi->ncdi_n_holes += DF_END(prev_frame) != DF_OFF(new_frame); 310 if (next_frame) 311 { 312 ncdi->ncdi_n_holes += DF_END(new_frame) != DF_OFF(next_frame); 313 --ncdi->ncdi_n_holes; 314 } 315 } 316 else 317 { 318 ncdi->ncdi_n_holes += next_frame 319 && DF_END(new_frame) != DF_OFF(next_frame); 320 list_was_empty: 321 TAILQ_INSERT_HEAD(&ncdi->ncdi_frames_in, new_frame, next_frame); 322 } 323 CHECK_ORDER(ncdi); 324 325 if (DF_FIN(new_frame)) 326 { 327 ncdi->ncdi_flags |= NCDI_FIN_SET; 328 ncdi->ncdi_fin_off = DF_END(new_frame); 329 LSQ_DEBUG("FIN set at %"PRIu64, DF_END(new_frame)); 330 } 331 332 ++ncdi->ncdi_n_frames; 333 ncdi->ncdi_byteage += DF_SIZE(new_frame); 334 *p_n_frames = count; 335 336 return INS_FRAME_OK | CASE('Z'); 337} 338 339 340static int 341check_efficiency (struct nocopy_data_in *ncdi, unsigned count) 342{ 343 if (ncdi->ncdi_n_frames <= EFF_CHECK_THRESH_LOW) 344 { 345 ncdi->ncdi_cons_far = 0; 346 return 0; 347 } 348 if (ncdi->ncdi_n_frames > EFF_CHECK_THRESH_HIGH) 349 return 1; 350 if (count >= ncdi->ncdi_n_frames / 2) 351 { 352 ++ncdi->ncdi_cons_far; 353 if (ncdi->ncdi_cons_far > EFF_FAR_TRAVERSE_COUNT) 354 return 1; 355 } 356 else 357 ncdi->ncdi_cons_far = 0; 358 if (ncdi->ncdi_n_holes > EFF_MAX_HOLES) 359 return 1; 360 if (ncdi->ncdi_byteage / EFF_TINY_FRAME_SZ < ncdi->ncdi_n_frames) 361 return 1; 362 return 0; 363} 364 365 366static void 367set_eff_alert (struct nocopy_data_in *ncdi) 368{ 369 LSQ_DEBUG("low efficiency: n_frames: %u; n_holes: %u; cons_far: %u; " 370 "byteage: %"PRIu64, ncdi->ncdi_n_frames, ncdi->ncdi_n_holes, 371 ncdi->ncdi_cons_far, ncdi->ncdi_byteage); 372 ncdi->ncdi_data_in.di_flags |= DI_SWITCH_IMPL; 373} 374 375 376static enum ins_frame 377nocopy_di_insert_frame (struct data_in *data_in, 378 struct stream_frame *new_frame, uint64_t read_offset) 379{ 380 struct nocopy_data_in *const ncdi = NCDI_PTR(data_in); 381 unsigned count; 382 enum ins_frame ins; 383 int ins_case; 384 385 assert(0 == (new_frame->data_frame.df_fin & ~1)); 386 ins_case = insert_frame(ncdi, new_frame, read_offset, &count); 387 ins = ins_case & 0xFF; 388 ins_case >>= 8; 389 LSQ_DEBUG("%s: ins: %d (case '%c')", __func__, ins, (char) ins_case); 390 switch (ins) 391 { 392 case INS_FRAME_OK: 393 if (check_efficiency(ncdi, count)) 394 set_eff_alert(ncdi); 395 break; 396 case INS_FRAME_DUP: 397 case INS_FRAME_ERR: 398 lsquic_packet_in_put(ncdi->ncdi_conn_pub->mm, new_frame->packet_in); 399 lsquic_malo_put(new_frame); 400 break; 401 default: 402 break; 403 } 404 405 return ins; 406} 407 408 409static struct data_frame * 410nocopy_di_get_frame (struct data_in *data_in, uint64_t read_offset) 411{ 412 struct nocopy_data_in *const ncdi = NCDI_PTR(data_in); 413 struct stream_frame *frame = TAILQ_FIRST(&ncdi->ncdi_frames_in); 414 if (frame && frame->data_frame.df_offset + 415 frame->data_frame.df_read_off == read_offset) 416 { 417 LSQ_DEBUG("get_frame: frame (off: %"PRIu64", size: %u, fin: %d), at " 418 "read offset %"PRIu64, DF_OFF(frame), DF_SIZE(frame), DF_FIN(frame), 419 read_offset); 420 return &frame->data_frame; 421 } 422 else 423 { 424 LSQ_DEBUG("get_frame: no frame at read offset %"PRIu64, read_offset); 425 return NULL; 426 } 427} 428 429 430static void 431nocopy_di_frame_done (struct data_in *data_in, struct data_frame *data_frame) 432{ 433 struct nocopy_data_in *const ncdi = NCDI_PTR(data_in); 434 struct stream_frame *const frame = STREAM_FRAME_PTR(data_frame), *first; 435 assert(data_frame->df_read_off == data_frame->df_size); 436 TAILQ_REMOVE(&ncdi->ncdi_frames_in, frame, next_frame); 437 first = TAILQ_FIRST(&ncdi->ncdi_frames_in); 438 ncdi->ncdi_n_holes -= first && frame->data_frame.df_offset + 439 frame->data_frame.df_size != first->data_frame.df_offset; 440 --ncdi->ncdi_n_frames; 441 ncdi->ncdi_byteage -= frame->data_frame.df_size; 442 if (DF_FIN(frame)) 443 { 444 ncdi->ncdi_flags |= NCDI_FIN_REACHED; 445 LSQ_DEBUG("FIN has been reached at offset %"PRIu64, DF_END(frame)); 446 } 447 LSQ_DEBUG("frame (off: %"PRIu64", size: %u, fin: %d) done", 448 DF_OFF(frame), DF_SIZE(frame), DF_FIN(frame)); 449 lsquic_packet_in_put(ncdi->ncdi_conn_pub->mm, frame->packet_in); 450 lsquic_malo_put(frame); 451} 452 453 454static int 455nocopy_di_empty (struct data_in *data_in) 456{ 457 struct nocopy_data_in *const ncdi = NCDI_PTR(data_in); 458 return TAILQ_EMPTY(&ncdi->ncdi_frames_in); 459} 460 461 462struct data_in * 463nocopy_di_switch_impl (struct data_in *data_in, uint64_t read_offset) 464{ 465 struct nocopy_data_in *const ncdi = NCDI_PTR(data_in); 466 struct data_in *new_data_in; 467 stream_frame_t *frame; 468 enum ins_frame ins; 469 470 new_data_in = data_in_hash_new(ncdi->ncdi_conn_pub, ncdi->ncdi_stream_id, 471 ncdi->ncdi_byteage); 472 if (!new_data_in) 473 goto end; 474 475 while ((frame = TAILQ_FIRST(&ncdi->ncdi_frames_in))) 476 { 477 TAILQ_REMOVE(&ncdi->ncdi_frames_in, frame, next_frame); 478 ins = data_in_hash_insert_data_frame(new_data_in, &frame->data_frame, 479 read_offset); 480 lsquic_packet_in_put(ncdi->ncdi_conn_pub->mm, frame->packet_in); 481 lsquic_malo_put(frame); 482 if (INS_FRAME_ERR == ins) 483 { 484 new_data_in->di_if->di_destroy(new_data_in); 485 new_data_in = NULL; 486 goto end; 487 } 488 } 489 490 end: 491 data_in->di_if->di_destroy(data_in); 492 return new_data_in; 493} 494 495 496/* This function overestimates amount of memory because some packets are 497 * referenced by more than one stream. In the usual case, however, I 498 * expect the error not to be large. 499 */ 500static size_t 501nocopy_di_mem_used (struct data_in *data_in) 502{ 503 struct nocopy_data_in *const ncdi = NCDI_PTR(data_in); 504 const stream_frame_t *frame; 505 size_t size; 506 507 size = sizeof(*data_in); 508 TAILQ_FOREACH(frame, &ncdi->ncdi_frames_in, next_frame) 509 size += lsquic_packet_in_mem_used(frame->packet_in); 510 511 return size; 512} 513 514 515static const struct data_in_iface di_if_nocopy = { 516 .di_destroy = nocopy_di_destroy, 517 .di_empty = nocopy_di_empty, 518 .di_frame_done = nocopy_di_frame_done, 519 .di_get_frame = nocopy_di_get_frame, 520 .di_insert_frame = nocopy_di_insert_frame, 521 .di_mem_used = nocopy_di_mem_used, 522 .di_switch_impl = nocopy_di_switch_impl, 523}; 524 525static const struct data_in_iface *di_if_nocopy_ptr = &di_if_nocopy; 526