FreeBSD kernel IPv4 code
in_fib_dxr.c
1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2012-2022 Marko Zec
5 * Copyright (c) 2005, 2018 University of Zagreb
6 * Copyright (c) 2005 International Computer Science Institute
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30/*
31 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
32 * structures and a trivial search procedure. More significant bits of
33 * the search key are used to directly index a two-stage trie, while the
34 * remaining bits are used for finding the next hop in a sorted array.
35 * More details in:
36 *
37 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
38 * second in software, ACM SIGCOMM Computer Communication Review, September
39 * 2012
40 *
41 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
42 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
43 */
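/*
 * Illustrative note (editorial, not part of the original source): with the
 * default DXR_TRIE_BITS of 20, a 32-bit IPv4 key is split into a 20-bit
 * trie index (dst >> 12) and a 12-bit range offset (dst & 0xfff).  For
 * dst = 0x0a010203 (10.1.2.3) the trie index is 0x0a010 and the range
 * offset is 0x203; the offset is then binary-searched within the sorted
 * fragment array of the chunk selected by the trie index.
 */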
44
45#include <sys/cdefs.h>
46__FBSDID("$FreeBSD$");
47
48#include "opt_inet.h"
49
50#include <sys/param.h>
51#include <sys/kernel.h>
52#include <sys/ctype.h>
53#include <sys/epoch.h>
54#include <sys/malloc.h>
55#include <sys/module.h>
56#include <sys/socket.h>
57#include <sys/sysctl.h>
58#include <sys/syslog.h>
59
60#include <vm/uma.h>
61
62#include <netinet/in.h>
63#include <netinet/in_fib.h>
64
65#include <net/route.h>
66#include <net/route/route_ctl.h>
67#include <net/route/fib_algo.h>
68
69#define DXR_TRIE_BITS 20
70
71CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);
72
73/* DXR2: two-stage primary trie, instead of a single direct lookup table */
74#define DXR2
75
76#if DXR_TRIE_BITS > 16
77#define DXR_D 16
78#else
79#define DXR_D (DXR_TRIE_BITS - 1)
80#endif
81
82#define D_TBL_SIZE (1 << DXR_D)
83#define DIRECT_TBL_SIZE (1 << DXR_TRIE_BITS)
84#define DXR_RANGE_MASK (0xffffffffU >> DXR_TRIE_BITS)
85#define DXR_RANGE_SHIFT (32 - DXR_TRIE_BITS)
86
87#define DESC_BASE_BITS 22
88#define DESC_FRAGMENTS_BITS (32 - DESC_BASE_BITS)
89#define BASE_MAX ((1 << DESC_BASE_BITS) - 1)
90#define RTBL_SIZE_INCR (BASE_MAX / 64)
91
92#if DXR_TRIE_BITS < 24
93#define FRAGS_MASK_SHORT ((1 << (23 - DXR_TRIE_BITS)) - 1)
94#else
95#define FRAGS_MASK_SHORT 0
96#endif
97#define FRAGS_PREF_SHORT (((1 << DESC_FRAGMENTS_BITS) - 1) & \
98 ~FRAGS_MASK_SHORT)
99#define FRAGS_MARK_XL (FRAGS_PREF_SHORT - 1)
100#define FRAGS_MARK_HIT (FRAGS_PREF_SHORT - 2)
101
102#define IS_SHORT_FORMAT(x) ((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
103#define IS_LONG_FORMAT(x) ((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
104#define IS_XL_FORMAT(x) (x == FRAGS_MARK_XL)
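/*
 * Illustrative note (editorial, not part of the original source): for the
 * default DXR_TRIE_BITS of 20 the above work out to DESC_FRAGMENTS_BITS = 10,
 * FRAGS_MASK_SHORT = 0x7, FRAGS_PREF_SHORT = 0x3f8, FRAGS_MARK_XL = 0x3f7
 * and FRAGS_MARK_HIT = 0x3f6, i.e. the top of the 10-bit fragment-count
 * space is reserved for the short-format prefix and the XL / direct-hit
 * markers.
 */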
105
106#define RE_SHORT_MAX_NH ((1 << (DXR_TRIE_BITS - 8)) - 1)
107
108#define CHUNK_HASH_BITS 16
109#define CHUNK_HASH_SIZE (1 << CHUNK_HASH_BITS)
110#define CHUNK_HASH_MASK (CHUNK_HASH_SIZE - 1)
111
112#define TRIE_HASH_BITS 16
113#define TRIE_HASH_SIZE (1 << TRIE_HASH_BITS)
114#define TRIE_HASH_MASK (TRIE_HASH_SIZE - 1)
115
116#define XTBL_SIZE_INCR (DIRECT_TBL_SIZE / 16)
117
118#define UNUSED_BUCKETS 8
119
120/* Lookup structure elements */
121
122struct direct_entry {
123 uint32_t fragments: DESC_FRAGMENTS_BITS;
124 uint32_t base: DESC_BASE_BITS;
125};
126
127struct range_entry_long {
128 uint32_t start: DXR_RANGE_SHIFT;
129 uint32_t nexthop: DXR_TRIE_BITS;
130};
131
132#if DXR_TRIE_BITS < 24
133struct range_entry_short {
134 uint16_t start: DXR_RANGE_SHIFT - 8;
135 uint16_t nexthop: DXR_TRIE_BITS - 8;
136};
137#endif
138
139/* Auxiliary structures */
140
141struct heap_entry {
142 uint32_t start;
143 uint32_t end;
144 uint32_t preflen;
145 uint32_t nexthop;
146};
147
148struct chunk_desc {
149 LIST_ENTRY(chunk_desc) cd_all_le;
150 LIST_ENTRY(chunk_desc) cd_hash_le;
151 uint32_t cd_hash;
152 uint32_t cd_refcnt;
153 uint32_t cd_base;
154 uint32_t cd_cur_size;
155 uint32_t cd_max_size;
156};
157
158struct trie_desc {
159 LIST_ENTRY(trie_desc) td_all_le;
160 LIST_ENTRY(trie_desc) td_hash_le;
161 uint32_t td_hash;
162 uint32_t td_index;
163 uint32_t td_refcnt;
164};
165
166struct dxr_aux {
167 /* Glue to external state */
168 struct fib_data *fd;
169 uint32_t fibnum;
170 int refcnt;
171
172 /* Auxiliary build-time tables */
173 struct direct_entry direct_tbl[DIRECT_TBL_SIZE];
174 uint16_t d_tbl[D_TBL_SIZE];
175 struct direct_entry *x_tbl;
176 union {
177 struct range_entry_long re;
178 uint32_t fragments;
179 } *range_tbl;
180
181 /* Auxiliary internal state */
182 uint32_t updates_mask[DIRECT_TBL_SIZE / 32];
183 struct trie_desc *trietbl[D_TBL_SIZE];
184 LIST_HEAD(, chunk_desc) chunk_hashtbl[CHUNK_HASH_SIZE];
185 LIST_HEAD(, chunk_desc) all_chunks;
186 LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS];
187 LIST_HEAD(, trie_desc) trie_hashtbl[TRIE_HASH_SIZE];
188 LIST_HEAD(, trie_desc) all_trie;
189 LIST_HEAD(, trie_desc) unused_trie; /* abuses hash link entry */
190 struct sockaddr_in dst;
191 struct sockaddr_in mask;
192 struct heap_entry heap[33];
193 uint32_t prefixes;
194 uint32_t updates_low;
195 uint32_t updates_high;
196 uint32_t unused_chunks_size;
197 uint32_t xtbl_size;
198 uint32_t all_trie_cnt;
199 uint32_t unused_trie_cnt;
200 uint32_t trie_rebuilt_prefixes;
201 uint32_t heap_index;
202 uint32_t d_bits;
203 uint32_t rtbl_size;
204 uint32_t rtbl_top;
205 uint32_t rtbl_work_frags;
206 uint32_t work_chunk;
207};
208
209/* Main lookup structure container */
210
211struct dxr {
212 /* Lookup tables */
213 void *d;
214 void *x;
215 void *r;
216 struct nhop_object **nh_tbl;
217
218 /* Glue to external state */
219 struct dxr_aux *aux;
220 struct fib_data *fd;
221 struct epoch_context epoch_ctx;
222 uint32_t fibnum;
223 uint16_t d_shift;
224 uint16_t x_shift;
225 uint32_t x_mask;
226};
227
228static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
229static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");
230
231uma_zone_t chunk_zone;
232uma_zone_t trie_zone;
233
234VNET_DEFINE_STATIC(int, frag_limit) = 100;
235#define V_frag_limit VNET(frag_limit)
236
237/* Binary search for a matching address range */
238#define DXR_LOOKUP_STAGE \
239 if (masked_dst < range[middle].start) { \
240 upperbound = middle; \
241 middle = (middle + lowerbound) / 2; \
242 } else if (masked_dst < range[middle + 1].start) \
243 return (range[middle].nexthop); \
244 else { \
245 lowerbound = middle + 1; \
246 middle = (upperbound + middle + 1) / 2; \
247 } \
248 if (upperbound == lowerbound) \
249 return (range[lowerbound].nexthop);
250
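/*
 * Illustrative note (editorial, not part of the original source): each
 * DXR_LOOKUP_STAGE expansion performs one halving step of the binary
 * search over [lowerbound, upperbound].  The search loops in
 * range_lookup() below unroll two such stages per iteration and return
 * either on an exact range hit or once the interval collapses to a
 * single element.
 */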
251static int
252range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst)
253{
254 uint32_t base;
255 uint32_t upperbound;
256 uint32_t middle;
257 uint32_t lowerbound;
258 uint32_t masked_dst;
259
260 base = de.base;
261 lowerbound = 0;
262 masked_dst = dst & DXR_RANGE_MASK;
263
264#if DXR_TRIE_BITS < 24
265 if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
266 upperbound = de.fragments & FRAGS_MASK_SHORT;
267 struct range_entry_short *range =
268 (struct range_entry_short *) &rt[base];
269
270 masked_dst >>= 8;
271 middle = upperbound;
272 upperbound = upperbound * 2 + 1;
273
274 for (;;) {
275 DXR_LOOKUP_STAGE
276 DXR_LOOKUP_STAGE
277 }
278 }
279#endif
280
281 upperbound = de.fragments;
282 middle = upperbound / 2;
283 struct range_entry_long *range = &rt[base];
284 if (__predict_false(IS_XL_FORMAT(de.fragments))) {
285 upperbound = *((uint32_t *) range);
286 range++;
287 middle = upperbound / 2;
288 }
289
290 for (;;) {
291 DXR_LOOKUP_STAGE
292 DXR_LOOKUP_STAGE
293 }
294}
295
296#define DXR_LOOKUP_DEFINE(D) \
297 static int inline \
298 dxr_lookup_##D(struct dxr *dxr, uint32_t dst) \
299 { \
300 struct direct_entry de; \
301 uint16_t *dt = dxr->d; \
302 struct direct_entry *xt = dxr->x; \
303 \
304 de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D))) \
305 + ((dst >> DXR_RANGE_SHIFT) & \
306 (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))]; \
307 if (__predict_true(de.fragments == FRAGS_MARK_HIT)) \
308 return (de.base); \
309 return (range_lookup(dxr->r, de, dst)); \
310 } \
311 \
312 static struct nhop_object * \
313 dxr_fib_lookup_##D(void *algo_data, \
314 const struct flm_lookup_key key, uint32_t scopeid __unused) \
315 { \
316 struct dxr *dxr = algo_data; \
317 \
318 return (dxr->nh_tbl[dxr_lookup_##D(dxr, \
319 ntohl(key.addr4.s_addr))]); \
320 }
321
322#ifdef DXR2
323#if DXR_TRIE_BITS > 16
324DXR_LOOKUP_DEFINE(16)
325#endif
326DXR_LOOKUP_DEFINE(15)
327DXR_LOOKUP_DEFINE(14)
328DXR_LOOKUP_DEFINE(13)
329DXR_LOOKUP_DEFINE(12)
330DXR_LOOKUP_DEFINE(11)
331DXR_LOOKUP_DEFINE(10)
332DXR_LOOKUP_DEFINE(9)
333#endif /* DXR2 */
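/*
 * Illustrative note (editorial, not part of the original source): the
 * DXR_LOOKUP_DEFINE() instantiations above generate one specialized
 * lookup function per supported d_bits value (9 through 16, the 16-bit
 * variant only when DXR_TRIE_BITS > 16), with the shift and mask amounts
 * folded in as compile-time constants; choose_lookup_fn() selects the
 * variant matching the D/X split chosen by dxr_build().
 */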
334
335static int inline
336dxr_lookup(struct dxr *dxr, uint32_t dst)
337{
338 struct direct_entry de;
339#ifdef DXR2
340 uint16_t *dt = dxr->d;
341 struct direct_entry *xt = dxr->x;
342
343 de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) +
344 ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)];
345#else /* !DXR2 */
346 struct direct_entry *dt = dxr->d;
347
348 de = dt[dst >> DXR_RANGE_SHIFT];
349#endif /* !DXR2 */
350 if (__predict_true(de.fragments == FRAGS_MARK_HIT))
351 return (de.base);
352 return (range_lookup(dxr->r, de, dst));
353}
354
355static void
356initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
357{
358 struct heap_entry *fhp = &da->heap[0];
359 struct rtentry *rt;
360 struct route_nhop_data rnd;
361
362 da->heap_index = 0;
363 da->dst.sin_addr.s_addr = htonl(dst_u32);
364 rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
365 &rnd);
366 if (rt != NULL) {
367 struct in_addr addr;
368 uint32_t scopeid;
369
370 rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
371 fhp->start = ntohl(addr.s_addr);
372 fhp->end = fhp->start;
373 if (fhp->preflen < 32)
374 fhp->end |= (0xffffffffU >> fhp->preflen);
375 fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
376 } else {
377 fhp->preflen = fhp->nexthop = fhp->start = 0;
378 fhp->end = 0xffffffffU;
379 }
380}
381
382static uint32_t
383chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
384{
385
386 if (IS_SHORT_FORMAT(fdesc->fragments))
387 return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
388 else if (IS_XL_FORMAT(fdesc->fragments))
389 return (da->range_tbl[fdesc->base].fragments + 2);
390 else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
391 return (fdesc->fragments + 1);
392}
393
394static uint32_t
395chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
396{
397 uint32_t size = chunk_size(da, fdesc);
398 uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
399 uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
400 uint32_t hash = fdesc->fragments;
401
402 for (; p < l; p++)
403 hash = (hash << 7) + (hash >> 13) + *p;
404
405 return (hash + (hash >> 16));
406}
407
408static int
409chunk_ref(struct dxr_aux *da, uint32_t chunk)
410{
411 struct direct_entry *fdesc = &da->direct_tbl[chunk];
412 struct chunk_desc *cdp, *empty_cdp;
413 uint32_t base = fdesc->base;
414 uint32_t size = chunk_size(da, fdesc);
415 uint32_t hash = chunk_hash(da, fdesc);
416 int i;
417
418 /* Find an existing descriptor */
419 LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
420 cd_hash_le) {
421 if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
422 memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
423 sizeof(struct range_entry_long) * size))
424 continue;
425 da->rtbl_top = fdesc->base;
426 fdesc->base = cdp->cd_base;
427 cdp->cd_refcnt++;
428 return (0);
429 }
430
431 /* No matching chunks found. Find an empty one to recycle. */
432 for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
433 cdp = LIST_FIRST(&da->unused_chunks[i]);
434
435 if (cdp == NULL)
436 LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
437 if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
438 empty_cdp->cd_max_size < cdp->cd_max_size)) {
439 cdp = empty_cdp;
440 if (empty_cdp->cd_max_size == size)
441 break;
442 }
443
444 if (cdp != NULL) {
445 /* Copy from heap into the recycled chunk */
446 bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
447 size * sizeof(struct range_entry_long));
448 fdesc->base = cdp->cd_base;
449 da->rtbl_top -= size;
450 da->unused_chunks_size -= cdp->cd_max_size;
451 if (cdp->cd_max_size > size) {
452 /* Split the range in two, need a new descriptor */
453 empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
454 if (empty_cdp == NULL)
455 return (1);
456 LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
457 empty_cdp->cd_base = cdp->cd_base + size;
458 empty_cdp->cd_cur_size = 0;
459 empty_cdp->cd_max_size = cdp->cd_max_size - size;
460
461 i = empty_cdp->cd_max_size;
462 if (i >= UNUSED_BUCKETS)
463 i = 0;
464 LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
465 cd_hash_le);
466
467 da->unused_chunks_size += empty_cdp->cd_max_size;
468 cdp->cd_max_size = size;
469 }
470 LIST_REMOVE(cdp, cd_hash_le);
471 } else {
472 /* Alloc a new descriptor at the top of the heap */
473 cdp = uma_zalloc(chunk_zone, M_NOWAIT);
474 if (cdp == NULL)
475 return (1);
476 cdp->cd_max_size = size;
477 cdp->cd_base = fdesc->base;
478 LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
479 KASSERT(cdp->cd_base + cdp->cd_max_size == da->rtbl_top,
480 ("dxr: %s %d", __FUNCTION__, __LINE__));
481 }
482
483 cdp->cd_hash = hash;
484 cdp->cd_refcnt = 1;
485 cdp->cd_cur_size = size;
486 LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
487 cd_hash_le);
488 if (da->rtbl_top >= da->rtbl_size) {
489 if (da->rtbl_top >= BASE_MAX) {
490 FIB_PRINTF(LOG_ERR, da->fd,
491 "structural limit exceeded at %d "
492 "range table elements", da->rtbl_top);
493 return (1);
494 }
495 da->rtbl_size += RTBL_SIZE_INCR;
496 i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
497 FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
498 da->rtbl_top * 100 / BASE_MAX);
499 da->range_tbl = realloc(da->range_tbl,
500 sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
501 M_DXRAUX, M_NOWAIT);
502 if (da->range_tbl == NULL)
503 return (1);
504 }
505
506 return (0);
507}
508
509static void
510chunk_unref(struct dxr_aux *da, uint32_t chunk)
511{
512 struct direct_entry *fdesc = &da->direct_tbl[chunk];
513 struct chunk_desc *cdp, *cdp2;
514 uint32_t base = fdesc->base;
515 uint32_t size = chunk_size(da, fdesc);
516 uint32_t hash = chunk_hash(da, fdesc);
517 int i;
518
519 /* Find the corresponding descriptor */
520 LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
521 cd_hash_le)
522 if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
523 memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
524 sizeof(struct range_entry_long) * size) == 0)
525 break;
526
527 KASSERT(cdp != NULL, ("dxr: dangling chunk"));
528 if (--cdp->cd_refcnt > 0)
529 return;
530
531 LIST_REMOVE(cdp, cd_hash_le);
532 da->unused_chunks_size += cdp->cd_max_size;
533 cdp->cd_cur_size = 0;
534
535 /* Attempt to merge with the preceding chunk, if empty */
536 cdp2 = LIST_NEXT(cdp, cd_all_le);
537 if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
538 KASSERT(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base,
539 ("dxr: %s %d", __FUNCTION__, __LINE__));
540 LIST_REMOVE(cdp, cd_all_le);
541 LIST_REMOVE(cdp2, cd_hash_le);
542 cdp2->cd_max_size += cdp->cd_max_size;
543 uma_zfree(chunk_zone, cdp);
544 cdp = cdp2;
545 }
546
547 /* Attempt to merge with the subsequent chunk, if empty */
548 cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
549 if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
550 KASSERT(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base,
551 ("dxr: %s %d", __FUNCTION__, __LINE__));
552 LIST_REMOVE(cdp, cd_all_le);
553 LIST_REMOVE(cdp2, cd_hash_le);
554 cdp2->cd_max_size += cdp->cd_max_size;
555 cdp2->cd_base = cdp->cd_base;
556 uma_zfree(chunk_zone, cdp);
557 cdp = cdp2;
558 }
559
560 if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
561 /* Free the chunk on the top of the range heap, trim the heap */
562 KASSERT(cdp == LIST_FIRST(&da->all_chunks),
563 ("dxr: %s %d", __FUNCTION__, __LINE__));
564 da->rtbl_top -= cdp->cd_max_size;
565 da->unused_chunks_size -= cdp->cd_max_size;
566 LIST_REMOVE(cdp, cd_all_le);
567 uma_zfree(chunk_zone, cdp);
568 return;
569 }
570
571 i = cdp->cd_max_size;
572 if (i >= UNUSED_BUCKETS)
573 i = 0;
574 LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
575}
576
577#ifdef DXR2
578static uint32_t
579trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
580{
581 uint32_t i, *val;
582 uint32_t hash = 0;
583
584 for (i = 0; i < (1 << dxr_x); i++) {
585 hash = (hash << 3) ^ (hash >> 3);
586 val = (uint32_t *)
587 (void *) &da->direct_tbl[(index << dxr_x) + i];
588 hash += (*val << 5);
589 hash += (*val >> 5);
590 }
591
592 return (hash + (hash >> 16));
593}
594
595static int
596trie_ref(struct dxr_aux *da, uint32_t index)
597{
598 struct trie_desc *tp;
599 uint32_t dxr_d = da->d_bits;
600 uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
601 uint32_t hash = trie_hash(da, dxr_x, index);
602
603 /* Find an existing descriptor */
604 LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
605 if (tp->td_hash == hash &&
606 memcmp(&da->direct_tbl[index << dxr_x],
607 &da->x_tbl[tp->td_index << dxr_x],
608 sizeof(*da->x_tbl) << dxr_x) == 0) {
609 tp->td_refcnt++;
610 da->trietbl[index] = tp;
611 return(tp->td_index);
612 }
613
614 tp = LIST_FIRST(&da->unused_trie);
615 if (tp != NULL) {
616 LIST_REMOVE(tp, td_hash_le);
617 da->unused_trie_cnt--;
618 } else {
619 tp = uma_zalloc(trie_zone, M_NOWAIT);
620 if (tp == NULL)
621 return (-1);
622 LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
623 tp->td_index = da->all_trie_cnt++;
624 }
625
626 tp->td_hash = hash;
627 tp->td_refcnt = 1;
628 LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
629 td_hash_le);
630 memcpy(&da->x_tbl[tp->td_index << dxr_x],
631 &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
632 da->trietbl[index] = tp;
633 if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
634 da->xtbl_size += XTBL_SIZE_INCR;
635 da->x_tbl = realloc(da->x_tbl,
636 sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
637 if (da->x_tbl == NULL)
638 return (-1);
639 }
640 return(tp->td_index);
641}
642
643static void
644trie_unref(struct dxr_aux *da, uint32_t index)
645{
646 struct trie_desc *tp = da->trietbl[index];
647
648 if (tp == NULL)
649 return;
650 da->trietbl[index] = NULL;
651 if (--tp->td_refcnt > 0)
652 return;
653
654 LIST_REMOVE(tp, td_hash_le);
655 da->unused_trie_cnt++;
656 if (tp->td_index != da->all_trie_cnt - 1) {
657 LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
658 return;
659 }
660
661 do {
662 da->all_trie_cnt--;
663 da->unused_trie_cnt--;
664 LIST_REMOVE(tp, td_all_le);
665 uma_zfree(trie_zone, tp);
666 LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
667 if (tp->td_index == da->all_trie_cnt - 1) {
668 LIST_REMOVE(tp, td_hash_le);
669 break;
670 }
671 } while (tp != NULL);
672}
673#endif
674
675static void
676heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
677 uint32_t nh)
678{
679 struct heap_entry *fhp;
680 int i;
681
682 for (i = da->heap_index; i >= 0; i--) {
683 if (preflen > da->heap[i].preflen)
684 break;
685 else if (preflen < da->heap[i].preflen)
686 da->heap[i + 1] = da->heap[i];
687 else
688 return;
689 }
690
691 fhp = &da->heap[i + 1];
692 fhp->preflen = preflen;
693 fhp->start = start;
694 fhp->end = end;
695 fhp->nexthop = nh;
696 da->heap_index++;
697}
698
699static int
700dxr_walk(struct rtentry *rt, void *arg)
701{
702 struct dxr_aux *da = arg;
703 uint32_t chunk = da->work_chunk;
704 uint32_t first = chunk << DXR_RANGE_SHIFT;
705 uint32_t last = first | DXR_RANGE_MASK;
706 struct range_entry_long *fp =
707 &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
708 struct heap_entry *fhp = &da->heap[da->heap_index];
709 uint32_t preflen, nh, start, end, scopeid;
710 struct in_addr addr;
711
712 rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
713 start = ntohl(addr.s_addr);
714 if (start > last)
715 return (-1); /* Beyond chunk boundaries, we are done */
716 if (start < first)
717 return (0); /* Skip this route */
718
719 end = start;
720 if (preflen < 32)
721 end |= (0xffffffffU >> preflen);
722 nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));
723
724 if (start == fhp->start)
725 heap_inject(da, start, end, preflen, nh);
726 else {
727 /* start > fhp->start */
728 while (start > fhp->end) {
729 uint32_t oend = fhp->end;
730
731 if (da->heap_index > 0) {
732 fhp--;
733 da->heap_index--;
734 } else
735 initheap(da, fhp->end + 1, chunk);
736 if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
737 fp++;
738 da->rtbl_work_frags++;
739 fp->start = (oend + 1) & DXR_RANGE_MASK;
740 fp->nexthop = fhp->nexthop;
741 }
742 }
743 if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
744 nh != fp->nexthop) {
745 fp++;
746 da->rtbl_work_frags++;
747 fp->start = start & DXR_RANGE_MASK;
748 } else if (da->rtbl_work_frags) {
749 if ((--fp)->nexthop == nh)
750 da->rtbl_work_frags--;
751 else
752 fp++;
753 }
754 fp->nexthop = nh;
755 heap_inject(da, start, end, preflen, nh);
756 }
757
758 return (0);
759}
760
761static int
762update_chunk(struct dxr_aux *da, uint32_t chunk)
763{
764 struct range_entry_long *fp;
765#if DXR_TRIE_BITS < 24
766 struct range_entry_short *fps;
767 uint32_t start, nh, i;
768#endif
769 struct heap_entry *fhp;
770 uint32_t first = chunk << DXR_RANGE_SHIFT;
771 uint32_t last = first | DXR_RANGE_MASK;
772
773 if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
774 chunk_unref(da, chunk);
775
776 initheap(da, first, chunk);
777
778 fp = &da->range_tbl[da->rtbl_top].re;
779 da->rtbl_work_frags = 0;
780 fp->start = first & DXR_RANGE_MASK;
781 fp->nexthop = da->heap[0].nexthop;
782
783 da->dst.sin_addr.s_addr = htonl(first);
784 da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);
785
786 da->work_chunk = chunk;
787 rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
788 (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
789 dxr_walk, da);
790
791 /* Flush any remaining objects on the heap */
792 fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
793 fhp = &da->heap[da->heap_index];
794 while (fhp->preflen > DXR_TRIE_BITS) {
795 uint32_t oend = fhp->end;
796
797 if (da->heap_index > 0) {
798 fhp--;
799 da->heap_index--;
800 } else
801 initheap(da, fhp->end + 1, chunk);
802 if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
803 /* Have we crossed the upper chunk boundary? */
804 if (oend >= last)
805 break;
806 fp++;
807 da->rtbl_work_frags++;
808 fp->start = (oend + 1) & DXR_RANGE_MASK;
809 fp->nexthop = fhp->nexthop;
810 }
811 }
812
813 /* Direct hit if the chunk contains only a single fragment */
814 if (da->rtbl_work_frags == 0) {
815 da->direct_tbl[chunk].base = fp->nexthop;
816 da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
817 return (0);
818 }
819
820 da->direct_tbl[chunk].base = da->rtbl_top;
821 da->direct_tbl[chunk].fragments = da->rtbl_work_frags;
822
823#if DXR_TRIE_BITS < 24
824 /* Check whether the chunk can be more compactly encoded */
825 fp = &da->range_tbl[da->rtbl_top].re;
826 for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
827 if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
828 break;
829 if (i == da->rtbl_work_frags + 1) {
830 fp = &da->range_tbl[da->rtbl_top].re;
831 fps = (void *) fp;
832 for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
833 start = fp->start;
834 nh = fp->nexthop;
835 fps->start = start >> 8;
836 fps->nexthop = nh;
837 }
838 fps->start = start >> 8;
839 fps->nexthop = nh;
840 da->rtbl_work_frags >>= 1;
841 da->direct_tbl[chunk].fragments =
842 da->rtbl_work_frags | FRAGS_PREF_SHORT;
843 } else
844#endif
845 if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
846 da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
847 memmove(&da->range_tbl[da->rtbl_top + 1],
848 &da->range_tbl[da->rtbl_top],
849 (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
850 da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
851 da->rtbl_work_frags++;
852 }
853 da->rtbl_top += (da->rtbl_work_frags + 1);
854 return (chunk_ref(da, chunk));
855}
856
857static void
858dxr_build(struct dxr *dxr)
859{
860 struct dxr_aux *da = dxr->aux;
861 struct chunk_desc *cdp;
862 struct rib_rtable_info rinfo;
863 struct timeval t0, t1, t2, t3;
864 uint32_t r_size, dxr_tot_size;
865 uint32_t i, m, range_rebuild = 0;
866 uint32_t range_frag;
867#ifdef DXR2
868 struct trie_desc *tp;
869 uint32_t d_tbl_size, dxr_x, d_size, x_size;
870 uint32_t ti, trie_rebuild = 0, prev_size = 0;
871 uint32_t trie_frag;
872#endif
873
874 KASSERT(dxr->d == NULL, ("dxr: d not free"));
875
876 if (da == NULL) {
877 da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
878 if (da == NULL)
879 return;
880 dxr->aux = da;
881 da->fibnum = dxr->fibnum;
882 da->refcnt = 1;
883 LIST_INIT(&da->all_chunks);
884 LIST_INIT(&da->all_trie);
885 da->rtbl_size = RTBL_SIZE_INCR;
886 da->range_tbl = NULL;
887 da->xtbl_size = XTBL_SIZE_INCR;
888 da->x_tbl = NULL;
889 bzero(&da->dst, sizeof(da->dst));
890 bzero(&da->mask, sizeof(da->mask));
891 da->dst.sin_len = sizeof(da->dst);
892 da->mask.sin_len = sizeof(da->mask);
893 da->dst.sin_family = AF_INET;
894 da->mask.sin_family = AF_INET;
895 }
896 if (da->range_tbl == NULL) {
897 da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
898 + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
899 if (da->range_tbl == NULL)
900 return;
901 range_rebuild = 1;
902 }
903#ifdef DXR2
904 if (da->x_tbl == NULL) {
905 da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
906 M_DXRAUX, M_NOWAIT);
907 if (da->x_tbl == NULL)
908 return;
909 trie_rebuild = 1;
910 }
911#endif
912 da->fd = dxr->fd;
913
914 microuptime(&t0);
915
916 dxr->nh_tbl = fib_get_nhop_array(da->fd);
917 fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
918
919 if (da->updates_low > da->updates_high)
920 range_rebuild = 1;
921
922range_build:
923 if (range_rebuild) {
924 /* Bulk cleanup */
925 bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
926 while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
927 LIST_REMOVE(cdp, cd_all_le);
928 uma_zfree(chunk_zone, cdp);
929 }
930 for (i = 0; i < UNUSED_BUCKETS; i++)
931 LIST_INIT(&da->unused_chunks[i]);
932 da->unused_chunks_size = 0;
933 da->rtbl_top = 0;
934 da->updates_low = 0;
935 da->updates_high = DIRECT_TBL_SIZE - 1;
936 memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
937 for (i = 0; i < DIRECT_TBL_SIZE; i++) {
938 da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
939 da->direct_tbl[i].base = 0;
940 }
941 }
942 da->prefixes = rinfo.num_prefixes;
943
944 /* DXR: construct direct & range table */
945 for (i = da->updates_low; i <= da->updates_high; i++) {
946 m = da->updates_mask[i >> 5] >> (i & 0x1f);
947 if (m == 0)
948 i |= 0x1f;
949 else if (m & 1 && update_chunk(da, i) != 0)
950 return;
951 }
952
953 range_frag = 0;
954 if (da->rtbl_top)
955 range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
956 if (range_frag > V_frag_limit) {
957 range_rebuild = 1;
958 goto range_build;
959 }
960
961 r_size = sizeof(*da->range_tbl) * da->rtbl_top;
962 microuptime(&t1);
963
964#ifdef DXR2
965 if (range_rebuild ||
966 abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
967 trie_rebuild = 1;
968
969trie_build:
970 if (trie_rebuild) {
971 da->trie_rebuilt_prefixes = da->prefixes;
972 da->d_bits = DXR_D;
973 da->updates_low = 0;
974 da->updates_high = DIRECT_TBL_SIZE - 1;
975 if (!range_rebuild)
976 memset(da->updates_mask, 0xff,
977 sizeof(da->updates_mask));
978 }
979
980dxr2_try_squeeze:
981 if (trie_rebuild) {
982 /* Bulk cleanup */
983 bzero(da->trietbl, sizeof(da->trietbl));
984 bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
985 while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
986 LIST_REMOVE(tp, td_all_le);
987 uma_zfree(trie_zone, tp);
988 }
989 LIST_INIT(&da->unused_trie);
990 da->all_trie_cnt = da->unused_trie_cnt = 0;
991 }
992
993 /* Populate d_tbl, x_tbl */
994 dxr_x = DXR_TRIE_BITS - da->d_bits;
995 d_tbl_size = (1 << da->d_bits);
996
997 for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
998 i++) {
999 if (!trie_rebuild) {
1000 m = 0;
1001 for (int j = 0; j < (1 << dxr_x); j += 32)
1002 m |= da->updates_mask[((i << dxr_x) + j) >> 5];
1003 if (m == 0)
1004 continue;
1005 trie_unref(da, i);
1006 }
1007 ti = trie_ref(da, i);
1008 if ((int) ti < 0)
1009 return;
1010 da->d_tbl[i] = ti;
1011 }
1012
1013 trie_frag = 0;
1014 if (da->all_trie_cnt)
1015 trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
1016 if (trie_frag > V_frag_limit) {
1017 trie_rebuild = 1;
1018 goto trie_build;
1019 }
1020
1021 d_size = sizeof(*da->d_tbl) * d_tbl_size;
1022 x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
1023 * da->all_trie_cnt;
1024 dxr_tot_size = d_size + x_size + r_size;
1025
1026 if (trie_rebuild == 1) {
1027 /* Try to find a more compact D/X split */
1028 if (prev_size == 0 || dxr_tot_size <= prev_size)
1029 da->d_bits--;
1030 else {
1031 da->d_bits++;
1032 trie_rebuild = 2;
1033 }
1034 prev_size = dxr_tot_size;
1035 goto dxr2_try_squeeze;
1036 }
1037 microuptime(&t2);
1038#else /* !DXR2 */
1039 dxr_tot_size = sizeof(da->direct_tbl) + r_size;
1040 t2 = t1;
1041#endif
1042
1043 dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
1044 if (dxr->d == NULL)
1045 return;
1046#ifdef DXR2
1047 memcpy(dxr->d, da->d_tbl, d_size);
1048 dxr->x = ((char *) dxr->d) + d_size;
1049 memcpy(dxr->x, da->x_tbl, x_size);
1050 dxr->r = ((char *) dxr->x) + x_size;
1051 dxr->d_shift = 32 - da->d_bits;
1052 dxr->x_shift = dxr_x;
1053 dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
1054#else /* !DXR2 */
1055 memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl));
1056 dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl);
1057#endif
1058 memcpy(dxr->r, da->range_tbl, r_size);
1059
1060 if (da->updates_low <= da->updates_high)
1061 bzero(&da->updates_mask[da->updates_low / 32],
1062 (da->updates_high - da->updates_low) / 8 + 1);
1063 da->updates_low = DIRECT_TBL_SIZE - 1;
1064 da->updates_high = 0;
1065 microuptime(&t3);
1066
1067#ifdef DXR2
1068 FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
1069 da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
1070#else
1071 FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)",
1072 DXR_D, rinfo.num_prefixes, rinfo.num_nhops);
1073#endif
1074 i = dxr_tot_size * 100;
1075 if (rinfo.num_prefixes)
1076 i /= rinfo.num_prefixes;
1077 FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
1078 dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
1079 i / 100, i % 100);
1080#ifdef DXR2
1081 FIB_PRINTF(LOG_INFO, da->fd,
1082 "%d.%02d%% trie, %d.%02d%% range fragmentation",
1083 trie_frag / 100, trie_frag % 100,
1084 range_frag / 100, range_frag % 100);
1085#else
1086 FIB_PRINTF(LOG_INFO, da->fd, "%d.%01d%% range fragmentation",
1087 range_frag / 100, range_frag % 100);
1088#endif
1089 i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
1090 FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
1091 range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
1092#ifdef DXR2
1093 i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
1094 FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
1095 trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
1096#endif
1097 i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
1098 FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
1099 i / 1000, i % 1000);
1100}
1101
1102/*
1103 * Glue functions for attaching to FreeBSD 13 fib_algo infrastructure.
1104 */
1105
1106static struct nhop_object *
1107dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
1108 uint32_t scopeid)
1109{
1110 struct dxr *dxr = algo_data;
1111
1112 return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]);
1113}
1114
1115static enum flm_op_result
1116dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
1117{
1118 struct dxr *old_dxr = old_data;
1119 struct dxr_aux *da = NULL;
1120 struct dxr *dxr;
1121
1122 dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
1123 if (dxr == NULL)
1124 return (FLM_REBUILD);
1125
1126 /* Check whether we may reuse the old auxiliary structures */
1127 if (old_dxr != NULL && old_dxr->aux != NULL) {
1128 da = old_dxr->aux;
1129 atomic_add_int(&da->refcnt, 1);
1130 }
1131
1132 dxr->aux = da;
1133 dxr->d = NULL;
1134 dxr->fd = fd;
1135 dxr->fibnum = fibnum;
1136 *data = dxr;
1137
1138 return (FLM_SUCCESS);
1139}
1140
1141static void
1142dxr_destroy(void *data)
1143{
1144 struct dxr *dxr = data;
1145 struct dxr_aux *da;
1146 struct chunk_desc *cdp;
1147 struct trie_desc *tp;
1148
1149 if (dxr->d != NULL)
1150 free(dxr->d, M_DXRLPM);
1151
1152 da = dxr->aux;
1153 free(dxr, M_DXRAUX);
1154
1155 if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
1156 return;
1157
1158 /* Release auxiliary structures */
1159 while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
1160 LIST_REMOVE(cdp, cd_all_le);
1161 uma_zfree(chunk_zone, cdp);
1162 }
1163 while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
1164 LIST_REMOVE(tp, td_all_le);
1165 uma_zfree(trie_zone, tp);
1166 }
1167 free(da->range_tbl, M_DXRAUX);
1168 free(da->x_tbl, M_DXRAUX);
1169 free(da, M_DXRAUX);
1170}
1171
1172static void
1173epoch_dxr_destroy(epoch_context_t ctx)
1174{
1175 struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);
1176
1177 dxr_destroy(dxr);
1178}
1179
1180static void *
1181choose_lookup_fn(struct dxr_aux *da)
1182{
1183
1184#ifdef DXR2
1185 switch (da->d_bits) {
1186#if DXR_TRIE_BITS > 16
1187 case 16:
1188 return (dxr_fib_lookup_16);
1189#endif
1190 case 15:
1191 return (dxr_fib_lookup_15);
1192 case 14:
1193 return (dxr_fib_lookup_14);
1194 case 13:
1195 return (dxr_fib_lookup_13);
1196 case 12:
1197 return (dxr_fib_lookup_12);
1198 case 11:
1199 return (dxr_fib_lookup_11);
1200 case 10:
1201 return (dxr_fib_lookup_10);
1202 case 9:
1203 return (dxr_fib_lookup_9);
1204 }
1205#endif /* DXR2 */
1206 return (dxr_fib_lookup);
1207}
1208
1209static enum flm_op_result
1210dxr_dump_end(void *data, struct fib_dp *dp)
1211{
1212 struct dxr *dxr = data;
1213 struct dxr_aux *da;
1214
1215 dxr_build(dxr);
1216
1217 da = dxr->aux;
1218 if (da == NULL)
1219 return (FLM_REBUILD);
1220
1221 /* Structural limit exceeded, hard error */
1222 if (da->rtbl_top >= BASE_MAX)
1223 return (FLM_ERROR);
1224
1225 /* A malloc(,, M_NOWAIT) failed somewhere, retry later */
1226 if (dxr->d == NULL)
1227 return (FLM_REBUILD);
1228
1229 dp->f = choose_lookup_fn(da);
1230 dp->arg = dxr;
1231
1232 return (FLM_SUCCESS);
1233}
1234
1235static enum flm_op_result
1236dxr_dump_rib_item(struct rtentry *rt, void *data)
1237{
1238
1239 return (FLM_SUCCESS);
1240}
1241
1242static enum flm_op_result
1243dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
1244 void *data)
1245{
1246
1247 return (FLM_BATCH);
1248}
1249
1250static enum flm_op_result
1251dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
1252 void *data)
1253{
1254 struct dxr *dxr = data;
1255 struct dxr *new_dxr;
1256 struct dxr_aux *da;
1257 struct fib_dp new_dp;
1258 enum flm_op_result res;
1259 uint32_t ip, plen, hmask, start, end, i, ui;
1260#ifdef INVARIANTS
1261 struct rib_rtable_info rinfo;
1262 int update_delta = 0;
1263#endif
1264
1265 KASSERT(data != NULL, ("%s: NULL data", __FUNCTION__));
1266 KASSERT(q != NULL, ("%s: NULL q", __FUNCTION__));
1267 KASSERT(q->count < q->size, ("%s: q->count %d q->size %d",
1268 __FUNCTION__, q->count, q->size));
1269
1270 da = dxr->aux;
1271 KASSERT(da != NULL, ("%s: NULL dxr->aux", __FUNCTION__));
1272
1273 FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
1274 for (ui = 0; ui < q->count; ui++) {
1275#ifdef INVARIANTS
1276 if (q->entries[ui].nh_new != NULL)
1277 update_delta++;
1278 if (q->entries[ui].nh_old != NULL)
1279 update_delta--;
1280#endif
1281 plen = q->entries[ui].plen;
1282 ip = ntohl(q->entries[ui].addr4.s_addr);
1283 if (plen < 32)
1284 hmask = 0xffffffffU >> plen;
1285 else
1286 hmask = 0;
1287 start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
1288 end = (ip | hmask) >> DXR_RANGE_SHIFT;
1289
1290 if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
1291 for (i = start >> 5; i <= end >> 5; i++)
1292 da->updates_mask[i] = 0xffffffffU;
1293 else
1294 for (i = start; i <= end; i++)
1295 da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
1296 if (start < da->updates_low)
1297 da->updates_low = start;
1298 if (end > da->updates_high)
1299 da->updates_high = end;
1300 }
1301
1302#ifdef INVARIANTS
1303 fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
1304 KASSERT(da->prefixes + update_delta == rinfo.num_prefixes,
1305 ("%s: update count mismatch", __FUNCTION__));
1306#endif
1307
1308 res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
1309 if (res != FLM_SUCCESS)
1310 return (res);
1311
1312 dxr_build(new_dxr);
1313
1314 /* Structural limit exceeded, hard error */
1315 if (da->rtbl_top >= BASE_MAX) {
1316 dxr_destroy(new_dxr);
1317 return (FLM_ERROR);
1318 }
1319
1320 /* A malloc(,, M_NOWAIT) failed somewhere, retry later */
1321 if (new_dxr->d == NULL) {
1322 dxr_destroy(new_dxr);
1323 return (FLM_REBUILD);
1324 }
1325
1326 new_dp.f = choose_lookup_fn(da);
1327 new_dp.arg = new_dxr;
1328 if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
1329 fib_set_algo_ptr(dxr->fd, new_dxr);
1330 fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
1331 return (FLM_SUCCESS);
1332 }
1333
1334 dxr_destroy(new_dxr);
1335 return (FLM_REBUILD);
1336}
1337
1338static uint8_t
1339dxr_get_pref(const struct rib_rtable_info *rinfo)
1340{
1341
1342 /* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */
1343 return (251);
1344}
1345
1346SYSCTL_DECL(_net_route_algo);
1347SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1348 "DXR tunables");
1349
1350static int
1351sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
1352{
1353 char buf[8];
1354 int error, new, i;
1355
1356 snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
1357 V_frag_limit % 100);
1358 error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
1359 if (error != 0 || req->newptr == NULL)
1360 return (error);
1361 if (!isdigit(*buf) && *buf != '.')
1362 return (EINVAL);
1363 for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
1364 new = new * 10 + buf[i] - '0';
1365 new *= 100;
1366 if (buf[i++] == '.') {
1367 if (!isdigit(buf[i]))
1368 return (EINVAL);
1369 new += (buf[i++] - '0') * 10;
1370 if (isdigit(buf[i]))
1371 new += buf[i++] - '0';
1372 }
1373 if (new > 1000)
1374 return (EINVAL);
1375 V_frag_limit = new;
1376 snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
1377 V_frag_limit % 100);
1378 return (0);
1379}
1380
1381SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
1382 CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
1383 0, 0, sysctl_dxr_frag_limit, "A",
1384 "Fragmentation threshold to full rebuild");
1385
1386static struct fib_lookup_module fib_dxr_mod = {
1387 .flm_name = "dxr",
1388 .flm_family = AF_INET,
1389 .flm_init_cb = dxr_init,
1390 .flm_destroy_cb = dxr_destroy,
1391 .flm_dump_rib_item_cb = dxr_dump_rib_item,
1392 .flm_dump_end_cb = dxr_dump_end,
1393 .flm_change_rib_item_cb = dxr_change_rib_item,
1394 .flm_change_rib_items_cb = dxr_change_rib_batch,
1395 .flm_get_pref = dxr_get_pref,
1396};
1397
1398static int
1399dxr_modevent(module_t mod, int type, void *unused)
1400{
1401 int error;
1402
1403 switch (type) {
1404 case MOD_LOAD:
1405 chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
1406 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1407 trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
1408 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1409 fib_module_register(&fib_dxr_mod);
1410 return(0);
1411 case MOD_UNLOAD:
1412 error = fib_module_unregister(&fib_dxr_mod);
1413 if (error)
1414 return (error);
1415 uma_zdestroy(chunk_zone);
1416 uma_zdestroy(trie_zone);
1417 return(0);
1418 default:
1419 return(EOPNOTSUPP);
1420 }
1421}
1422
1423static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};
1424
1425DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
1426MODULE_VERSION(fib_dxr, 1);