subr_blist.c
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
 *
 * This module implements a general bitmap allocator/deallocator.  The
 * allocator eats around 2 bits per 'block'.  The module does not
 * try to interpret the meaning of a 'block' other than to return
 * SWAPBLK_NONE on an allocation failure.
 *
 * A radix tree controls access to pieces of the bitmap, and includes
 * auxiliary information at each interior node about the availability of
 * contiguous free blocks in the subtree rooted at that node.  A radix
 * constant defines the size of the bitmaps contained in a leaf node
 * and the number of descendants of each of the meta (interior) nodes.
 * Each subtree is associated with a range of blocks.  The root of any
 * subtree stores a hint field that defines an upper bound on the size
 * of the largest allocation that can begin in the associated block
 * range.  A hint is an upper bound on a potential allocation, but not
 * necessarily a tight upper bound.
 *
 * The bitmap field in each node directs the search for available blocks.
 * For a leaf node, a bit is set if the corresponding block is free.  For a
 * meta node, a bit is set if the corresponding subtree contains a free
 * block somewhere within it.  The search at a meta node considers only
 * children of that node that represent a range that includes a free block.
 *
 * The hinting greatly increases code efficiency for allocations while
 * the general radix structure optimizes both allocations and frees.  The
 * radix tree should be able to operate well no matter how much
 * fragmentation there is and no matter how large a bitmap is used.
 *
 * The blist code wires all necessary memory at creation time.  Neither
 * allocations nor frees require interaction with the memory subsystem.
 * The non-blocking nature of allocations and frees is required by swap
 * code (vm/swap_pager.c).
 *
 * LAYOUT: The radix tree is laid out recursively using a linear array.
 * Each meta node is immediately followed (laid out sequentially in
 * memory) by BLIST_RADIX lower-level nodes.  This is a recursive
 * structure but one that can be easily scanned through a very simple
 * 'skip' calculation.  The memory allocation is only large enough to
 * cover the number of blocks requested at creation time.  Nodes that
 * represent blocks beyond that limit, nodes that would never be read
 * or written, are not allocated, so that the last of the BLIST_RADIX
 * lower-level nodes of some nodes may not be allocated.
 *
 * NOTE: the allocator cannot currently allocate more than
 * BLIST_RADIX blocks per call.  It will panic with 'allocation too
 * large' if you try.  This is an area that could use improvement.  The
 * radix is large enough that this restriction does not affect the swap
 * system, though.  Currently only the allocation code is affected by
 * this algorithmic unfeature.  The freeing code can handle arbitrary
 * ranges.
 *
 * This code can be compiled stand-alone for debugging.
 */
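
/*
 * Example usage (a minimal sketch, using the stand-alone build): a new
 * blist starts with every block allocated, so the caller frees the range
 * it wants to manage before allocating from it.  blist_alloc() takes the
 * minimum count by reference and a maximum count, and returns the base
 * of the reserved range or SWAPBLK_NONE:
 *
 *        blist_t bl = blist_create(2048, M_WAITOK);
 *        int count = 16;
 *        daddr_t blk;
 *
 *        blist_free(bl, 0, 2048);                (make all 2048 blocks free)
 *        blk = blist_alloc(bl, &count, 32);      (want 16..32 contiguous blocks)
 *        if (blk != SWAPBLK_NONE)
 *                blist_free(bl, blk, count);     (count holds the actual size)
 *        blist_destroy(bl);
 */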

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define bitcount64(x)   __bitcount64((uint64_t)(x))
#define malloc(a,b,c)   calloc(a, 1)
#define free(a,b)       free(a)
#define ummin(a,b)      ((a) < (b) ? (a) : (b))
#define imin(a,b)       ((a) < (b) ? (a) : (b))
#define KASSERT(a,b)    assert(a)

#include <sys/blist.h>

#endif

/*
 * static support functions
 */
static daddr_t  blst_leaf_alloc(blmeta_t *scan, daddr_t blk,
                    int *count, int maxcount);
static daddr_t  blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
                    int maxcount, u_daddr_t radix);
static void     blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void     blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
                    u_daddr_t radix);
static void     blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
                    blist_t dest, daddr_t count);
static daddr_t  blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t  blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
                    u_daddr_t radix);
#ifndef _KERNEL
static void     blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
                    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

#define BLIST_MASK      (BLIST_RADIX - 1)
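
/*
 * BLIST_RADIX is a power of two, so (x & BLIST_MASK) below is
 * x % BLIST_RADIX, and (x & -radix) rounds x down to a multiple of
 * radix; both idioms recur throughout this file.
 */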

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_RADIX.  If 'm'
 * is short for BLIST_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / m)) / (m - 1)
 * skip = radix / (m - 1)
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

        return (radix / BLIST_MASK);
}
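
/*
 * Worked example: with BLIST_RADIX = 64, a subtree covering
 * radix = 64 * 64 = 4096 blocks consists of one meta node and 64 leaves,
 * and radix_to_skip(4096) = 4096 / 63 = 65 = 1 + 64, exactly the number
 * of array slots that subtree occupies.
 */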

/*
 * Provide a mask with count bits set, starting at position n.
 */
static inline u_daddr_t
bitrange(int n, int count)
{

        return (((u_daddr_t)-1 << n) &
            ((u_daddr_t)-1 >> (BLIST_RADIX - (n + count))));
}
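
/*
 * For example, with 64-bit u_daddr_t, bitrange(2, 3) is
 * (~0 << 2) & (~0 >> 59) = 0x1c: bits 2, 3 and 4 set.
 */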

/*
 * Find the first bit set in a u_daddr_t.
 */
static inline int
generic_bitpos(u_daddr_t mask)
{
        int hi, lo, mid;

        lo = 0;
        hi = BLIST_RADIX;
        while (lo + 1 < hi) {
                mid = (lo + hi) >> 1;
                if (mask & bitrange(0, mid))
                        hi = mid;
                else
                        lo = mid;
        }
        return (lo);
}
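
/*
 * For example, with BLIST_RADIX = 64, generic_bitpos(0x8) narrows
 * [lo, hi) from [0, 64) down to [3, 4) and returns 3: whenever no bit
 * below 'mid' is set, the lowest set bit must lie in [mid, hi).
 */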

static inline int
bitpos(u_daddr_t mask)
{

        switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
        case sizeof(long long):
                return (ffsll(mask) - 1);
#endif
#ifdef HAVE_INLINE_FFS
        case sizeof(int):
                return (ffs(mask) - 1);
#endif
        default:
                return (generic_bitpos(mask));
        }
}

/*
 * blist_create() - create a blist capable of handling up to the specified
 *                  number of blocks
 *
 *      blocks - must be greater than 0
 *      flags  - malloc flags
 *
 *      The smallest blist consists of a single leaf node capable of
 *      managing BLIST_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
        blist_t bl;
        u_daddr_t nodes, radix;

        KASSERT(blocks > 0, ("invalid block count"));

        /*
         * Calculate the radix and node count used for scanning.
         */
        nodes = 1;
        for (radix = 1; (blocks - 1) / BLIST_RADIX / radix > 0;
            radix *= BLIST_RADIX)
                nodes += 1 + (blocks - 1) / BLIST_RADIX / radix;

        /*
         * Include a sentinel node to ensure that cross-leaf scans stay within
         * the bounds of the allocation.
         */
        if (blocks % BLIST_RADIX == 0)
                nodes++;

        bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
            M_ZERO);
        if (bl == NULL)
                return (NULL);

        bl->bl_blocks = blocks;
        bl->bl_radix = radix;

#if defined(BLIST_DEBUG)
        printf(
                "BLIST representing %lld blocks (%lld MB of swap)"
                ", requiring %lldK of ram\n",
                (long long)bl->bl_blocks,
                (long long)bl->bl_blocks * 4 / 1024,
                (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
        );
        printf("BLIST raw radix tree contains %lld records\n",
            (long long)nodes);
#endif

        return (bl);
}
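
/*
 * Worked example: with BLIST_RADIX = 64 and blocks = 131072, the loop
 * above adds 1 + 2047 nodes on its first pass and 1 + 31 on its second,
 * leaving radix = 4096 and nodes = 1 + 2048 + 32 = 2081: 2048 leaves,
 * 32 mid-level meta nodes and one root.  Since 131072 % 64 == 0, the
 * sentinel brings the total to 2082 nodes.
 */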

void
blist_destroy(blist_t bl)
{

        free(bl, M_SWAP);
}

/*
 * blist_alloc() - reserve space in the block bitmap.  Return the base
 *                 of a contiguous region or SWAPBLK_NONE if space could
 *                 not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, int *count, int maxcount)
{
        daddr_t blk, cursor;

        KASSERT(*count <= maxcount,
            ("invalid parameters %d > %d", *count, maxcount));
        KASSERT(*count <= BLIST_MAX_ALLOC,
            ("minimum allocation too large: %d", *count));

        /*
         * This loop iterates at most twice.  An allocation failure in the
         * first iteration leads to a second iteration only if the cursor was
         * non-zero.  When the cursor is zero, an allocation failure will
         * stop further iterations.
         */
        for (cursor = bl->bl_cursor;; cursor = 0) {
                blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount,
                    bl->bl_radix);
                if (blk != SWAPBLK_NONE) {
                        bl->bl_avail -= *count;
                        bl->bl_cursor = blk + *count;
                        if (bl->bl_cursor == bl->bl_blocks)
                                bl->bl_cursor = 0;
                        return (blk);
                }
                if (cursor == 0)
                        return (SWAPBLK_NONE);
        }
}
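
/*
 * Example (a sketch): the cursor makes allocation behave like next-fit.
 * On a fully free blist, two back-to-back requests return adjacent
 * ranges:
 *
 *        int count = 8;
 *        daddr_t a = blist_alloc(bl, &count, 8);  (returns 0)
 *        count = 8;
 *        daddr_t b = blist_alloc(bl, &count, 8);  (returns 8, resuming at
 *                                                  bl_cursor)
 *
 * Only when the scan from the cursor fails does the loop above retry
 * once from block 0.
 */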

/*
 * blist_avail() - return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

        return (bl->bl_avail);
}

/*
 * blist_free() - free up space in the block bitmap.  The range must
 *                have been previously allocated.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

        KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
            ("freeing invalid range: blkno %jx, count %d, blocks %jd",
            (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
        blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
        bl->bl_avail += count;
}

/*
 * blist_fill() - mark a region in the block bitmap as off-limits
 *                to the allocator (i.e. allocate it), ignoring any
 *                existing allocations.  Return the number of blocks
 *                actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{
        daddr_t filled;

        KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks,
            ("filling invalid range: blkno %jx, count %d, blocks %jd",
            (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks));
        filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
        bl->bl_avail -= filled;
        return (filled);
}

/*
 * blist_resize() - resize an existing radix tree to handle the
 *                  specified number of blocks.  This will reallocate
 *                  the tree and transfer the previous bitmap to the new
 *                  one.  When extending the tree you can specify whether
 *                  the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
        blist_t newbl = blist_create(count, flags);
        blist_t save = *pbl;

        *pbl = newbl;
        if (count > save->bl_blocks)
                count = save->bl_blocks;
        blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

        /*
         * If resizing upwards, free the new space if the caller requested it.
         */
        if (freenew && count < newbl->bl_blocks) {
                blist_free(newbl, count, newbl->bl_blocks - count);
        }
        blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print() - dump radix tree
 */
void
blist_print(blist_t bl)
{
        printf("BLIST avail = %jd, cursor = %08jx {\n",
            (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);

        if (bl->bl_root->bm_bitmap != 0)
                blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
        printf("}\n");
}

#endif

static const u_daddr_t fib[] = {
        1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
        4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
        514229, 832040, 1346269, 2178309, 3524578,
};

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
        daddr_t start;                  /* current gap start, or SWAPBLK_NONE */
        daddr_t num;                    /* number of gaps observed */
        daddr_t max;                    /* largest gap size */
        daddr_t avg;                    /* average gap size */
        daddr_t err;                    /* sum - num * avg */
        daddr_t histo[nitems(fib)];     /* # gaps in each size range */
        int     max_bucket;             /* last histo elt with nonzero val */
};

/*
 * gap_stats_counting() - is the state 'counting 1 bits', as opposed to
 *                        'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

        return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats() - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

        bzero(stats, sizeof(*stats));
        stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats() - update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
        daddr_t size;
        int hi, lo, mid;

        if (!gap_stats_counting(stats)) {
                stats->start = posn;
                return;
        }
        size = posn - stats->start;
        stats->start = SWAPBLK_NONE;
        if (size > stats->max)
                stats->max = size;

        /*
         * Find the fibonacci range that contains size,
         * expecting to find it in an early range.
         */
        lo = 0;
        hi = 1;
        while (hi < nitems(fib) && fib[hi] <= size) {
                lo = hi;
                hi *= 2;
        }
        if (hi >= nitems(fib))
                hi = nitems(fib);
        while (lo + 1 != hi) {
                mid = (lo + hi) >> 1;
                if (fib[mid] <= size)
                        lo = mid;
                else
                        hi = mid;
        }
        stats->histo[lo]++;
        if (lo > stats->max_bucket)
                stats->max_bucket = lo;
        stats->err += size - stats->avg;
        stats->num++;
        stats->avg += stats->err / stats->num;
        stats->err %= stats->num;
}
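
/*
 * The last four statements above keep the invariant
 * sum_of_gap_sizes == num * avg + err without ever storing the sum:
 * after err += size - avg and num++, adding err / num to avg and
 * reducing err modulo num restores the invariant, so avg is always the
 * exact integer mean.  The histogram bucket chosen is the largest i
 * with fib[i] <= size, found by galloping and then binary search.
 */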

/*
 * dump_gap_stats() - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
        int i;

        sbuf_printf(s, "number of maximal free ranges: %jd\n",
            (intmax_t)stats->num);
        sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
        sbuf_printf(s, "average maximal free range size: %jd\n",
            (intmax_t)stats->avg);
        sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
        sbuf_printf(s, "               count  |  size range\n");
        sbuf_printf(s, "               -----  |  ----------\n");
        for (i = 0; i < stats->max_bucket; i++) {
                if (stats->histo[i] != 0) {
                        sbuf_printf(s, "%20jd  |  ",
                            (intmax_t)stats->histo[i]);
                        if (fib[i] != fib[i + 1] - 1)
                                sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
                                    (intmax_t)fib[i + 1] - 1);
                        else
                                sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
                }
        }
        sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
        if (stats->histo[i] > 1)
                sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
                    (intmax_t)stats->max);
        else
                sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats() - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
        struct gap_stats gstats;
        struct gap_stats *stats = &gstats;
        daddr_t i, nodes, radix;
        u_daddr_t diff, mask;
        int digit;

        init_gap_stats(stats);
        nodes = 0;
        radix = bl->bl_radix;
        for (i = 0; i < bl->bl_blocks; ) {
                /*
                 * Check for skippable subtrees starting at i.
                 */
                while (radix != 1) {
                        if (bl->bl_root[nodes].bm_bitmap == 0) {
                                if (gap_stats_counting(stats))
                                        update_gap_stats(stats, i);
                                break;
                        }

                        /*
                         * Skip subtree root.
                         */
                        nodes++;
                        radix /= BLIST_RADIX;
                }
                if (radix == 1) {
                        /*
                         * Scan leaf.
                         */
                        mask = bl->bl_root[nodes].bm_bitmap;
                        diff = mask ^ (mask << 1);
                        if (gap_stats_counting(stats))
                                diff ^= 1;
                        while (diff != 0) {
                                digit = bitpos(diff);
                                update_gap_stats(stats, i + digit);
                                diff ^= bitrange(digit, 1);
                        }
                }
                nodes += radix_to_skip(radix * BLIST_RADIX);
                i += radix * BLIST_RADIX;

                /*
                 * Find max size subtree starting at i.
                 */
                for (radix = 1;
                    ((i / BLIST_RADIX / radix) & BLIST_MASK) == 0;
                    radix *= BLIST_RADIX)
                        ;
        }
        update_gap_stats(stats, i);
        dump_gap_stats(stats, s);
}

/************************************************************************
 *                      ALLOCATION SUPPORT FUNCTIONS                    *
 ************************************************************************
 *
 * These support functions do all the actual work.  They may seem
 * rather longish, but that's because I've commented them up.  The
 * actual code is straightforward.
 *
 */

/*
 * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf.
 *
 *      'scan' is a leaf node, and its first block is at address 'start'.  The
 *      next leaf node could be adjacent, or several nodes away if the least
 *      common ancestor of 'scan' and its neighbor is several levels up.  Use
 *      addresses to determine how many meta-nodes lie between the leaves.  If
 *      the sequence of leaves starting with the next one has enough initial
 *      bits set, clear them and clear the bits in the meta nodes on the path
 *      up to the least common ancestor to mark any subtrees made completely
 *      empty.
 */
static int
blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount)
{
        u_daddr_t radix;
        daddr_t blk;
        int avail, digit;

        start += BLIST_RADIX;
        for (blk = start; blk - start < maxcount; blk += BLIST_RADIX) {
                /* Skip meta-nodes, as long as they promise more free blocks. */
                radix = BLIST_RADIX;
                while (((++scan)->bm_bitmap & 1) == 1 &&
                    ((blk / radix) & BLIST_MASK) == 0)
                        radix *= BLIST_RADIX;
                if (~scan->bm_bitmap != 0) {
                        /*
                         * Either there is no next leaf with any free blocks,
                         * or we've reached the next leaf and found that some
                         * of its blocks are not free.  In the first case,
                         * bitpos() returns zero here.
                         */
                        avail = blk - start + bitpos(~scan->bm_bitmap);
                        if (avail < count || avail == 0) {
                                /*
                                 * There isn't a next leaf with enough free
                                 * blocks at its beginning to bother
                                 * allocating.
                                 */
                                return (avail);
                        }
                        maxcount = imin(avail, maxcount);
                        if (maxcount % BLIST_RADIX == 0) {
                                /*
                                 * There was no next leaf.  Back scan up to
                                 * last leaf.
                                 */
                                do {
                                        radix /= BLIST_RADIX;
                                        --scan;
                                } while (radix != 1);
                                blk -= BLIST_RADIX;
                        }
                }
        }

        /*
         * 'scan' is the last leaf that provides blocks.  Clear from 1 to
         * BLIST_RADIX bits to represent the allocation of those last blocks.
         */
        if (maxcount % BLIST_RADIX != 0)
                scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_RADIX);
        else
                scan->bm_bitmap = 0;

        for (;;) {
                /* Back up over meta-nodes, clearing bits if necessary. */
                blk -= BLIST_RADIX;
                for (radix = BLIST_RADIX;
                    (digit = ((blk / radix) & BLIST_MASK)) == 0;
                    radix *= BLIST_RADIX) {
                        if ((scan--)->bm_bitmap == 0)
                                scan->bm_bitmap ^= 1;
                }
                if ((scan--)->bm_bitmap == 0)
                        scan[-digit * radix_to_skip(radix)].bm_bitmap ^=
                            (u_daddr_t)1 << digit;

                if (blk == start)
                        break;
                /* Clear all the bits of this leaf. */
                scan->bm_bitmap = 0;
        }
        return (maxcount);
}

/*
 * BLST_LEAF_ALLOC() - allocate at a leaf in the radix tree (a bitmap).
 *
 *      This function is the core of the allocator.  Its execution time is
 *      proportional to log(count), plus height of the tree if the allocation
 *      crosses a leaf boundary.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount)
{
        u_daddr_t mask;
        int bighint, count1, hi, lo, num_shifts;

        count1 = *count - 1;
        num_shifts = fls(count1);
        mask = ~scan->bm_bitmap;
        while ((mask & (mask + 1)) != 0 && num_shifts > 0) {
                /*
                 * If bit i is 0 in mask, then bits in [i, i + (count1 >>
                 * num_shifts)] are 1 in scan->bm_bitmap.  Reduce num_shifts to
                 * 0, while preserving this invariant.  The updates to mask
                 * leave fewer bits 0, but each bit that remains 0 represents a
                 * longer string of consecutive 1-bits in scan->bm_bitmap.  If
                 * more updates to mask cannot set more bits, because mask is
                 * partitioned with all 1 bits following all 0 bits, the loop
                 * terminates immediately.
                 */
                num_shifts--;
                mask |= mask >> ((count1 >> num_shifts) + 1) / 2;
        }
        bighint = count1 >> num_shifts;
        if (~mask == 0) {
                /*
                 * Update bighint.  There is no allocation bigger than
                 * count1 >> num_shifts starting in this leaf.
                 */
                scan->bm_bighint = bighint;
                return (SWAPBLK_NONE);
        }

        /* Discard any candidates that appear before blk. */
        if ((blk & BLIST_MASK) != 0) {
                if ((~mask & bitrange(0, blk & BLIST_MASK)) != 0) {
                        /* Grow bighint in case all discarded bits are set. */
                        bighint += blk & BLIST_MASK;
                        mask |= bitrange(0, blk & BLIST_MASK);
                        if (~mask == 0) {
                                scan->bm_bighint = bighint;
                                return (SWAPBLK_NONE);
                        }
                }
                blk -= blk & BLIST_MASK;
        }

        /*
         * The least significant clear bit in mask marks the start of the
         * first available range of sufficient size.  Find its position.
         */
        lo = bitpos(~mask);

        /*
         * Find how much space is available starting at that position.
         */
        if ((mask & (mask + 1)) != 0) {
                /* Count the 1 bits starting at position lo. */
                hi = bitpos(mask & (mask + 1)) + count1;
                if (maxcount < hi - lo)
                        hi = lo + maxcount;
                *count = hi - lo;
                mask = ~bitrange(lo, *count);
        } else if (maxcount <= BLIST_RADIX - lo) {
                /* All the blocks we can use are available here. */
                hi = lo + maxcount;
                *count = maxcount;
                mask = ~bitrange(lo, *count);
                if (hi == BLIST_RADIX)
                        scan->bm_bighint = bighint;
        } else {
                /* Check next leaf for some of the blocks we want or need. */
                count1 = *count - (BLIST_RADIX - lo);
                maxcount -= BLIST_RADIX - lo;
                hi = blst_next_leaf_alloc(scan, blk, count1, maxcount);
                if (hi < count1)
                        /*
                         * The next leaf cannot supply enough blocks to reach
                         * the minimum required allocation.  The hint cannot be
                         * updated, because the same allocation request could
                         * be satisfied later, by this leaf, if the state of
                         * the next leaf changes, and without any changes to
                         * this leaf.
                         */
                        return (SWAPBLK_NONE);
                *count = BLIST_RADIX - lo + hi;
                scan->bm_bighint = bighint;
        }

        /* Clear the allocated bits from this leaf. */
        scan->bm_bitmap &= mask;
        return (blk + lo);
}
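
/*
 * Worked example for the mask-folding loop above: suppose *count = 3, so
 * count1 = 2 and num_shifts = fls(2) = 2.  Both iterations then execute
 * mask |= mask >> 1.  Starting from mask = ~bitmap, a bit that is still
 * 0 after both steps marks a position where the bitmap holds 3
 * consecutive 1s.  In general, each iteration ORs in a shifted copy of
 * mask, roughly doubling the run length that a surviving 0 bit
 * certifies, until a 0 at position i means blocks i through i + count1
 * are all free.
 */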

/*
 * blst_meta_alloc() - allocate at a meta node in the radix tree.
 *
 *      Attempt to allocate at a meta node.  If we can't, we update
 *      bighint and return a failure.  Updating bighint optimizes future
 *      calls that hit this node.  We have to check for our collapse cases
 *      and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count,
    int maxcount, u_daddr_t radix)
{
        daddr_t blk, i, r, skip;
        u_daddr_t mask;
        bool scan_from_start;
        int digit;

        if (radix == 1)
                return (blst_leaf_alloc(scan, cursor, count, maxcount));
        blk = cursor & -(radix * BLIST_RADIX);
        scan_from_start = (cursor == blk);
        skip = radix_to_skip(radix);
        mask = scan->bm_bitmap;

        /* Discard any candidates that appear before cursor. */
        digit = (cursor / radix) & BLIST_MASK;
        mask &= (u_daddr_t)-1 << digit;
        if (mask == 0)
                return (SWAPBLK_NONE);

        /*
         * If the first try is for a block that includes the cursor, pre-undo
         * the digit * radix offset in the first call; otherwise, ignore the
         * cursor entirely.
         */
        if (((mask >> digit) & 1) == 1)
                cursor -= digit * radix;
        else
                cursor = blk;

        /*
         * Examine the nonempty subtree associated with each bit set in mask.
         */
        do {
                digit = bitpos(mask);
                i = 1 + digit * skip;
                if (*count <= scan[i].bm_bighint) {
                        /*
                         * The allocation might fit beginning in the i'th
                         * subtree.
                         */
                        r = blst_meta_alloc(&scan[i], cursor + digit * radix,
                            count, maxcount, radix / BLIST_RADIX);
                        if (r != SWAPBLK_NONE) {
                                if (scan[i].bm_bitmap == 0)
                                        scan->bm_bitmap ^= bitrange(digit, 1);
                                return (r);
                        }
                }
                cursor = blk;
        } while ((mask ^= bitrange(digit, 1)) != 0);

        /*
         * We couldn't allocate count in this subtree.  If the whole tree was
         * scanned, and the last tree node is allocated, update bighint.
         */
        if (scan_from_start && !(digit == BLIST_RADIX - 1 &&
            scan[i].bm_bighint == BLIST_MAX_ALLOC))
                scan->bm_bighint = *count - 1;

        return (SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() - free allocated block from leaf bitmap
 *
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
        u_daddr_t mask;

        /*
         * free some data in this bitmap
         * mask=0000111111111110000
         *          \_________/\__/
         *              count   n
         */
        mask = bitrange(blk & BLIST_MASK, count);
        KASSERT((scan->bm_bitmap & mask) == 0,
            ("freeing free block: %jx, size %d, mask %jx",
            (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask));
        scan->bm_bitmap |= mask;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *      This support routine frees a range of blocks from the bitmap.
 *      The range must be entirely enclosed by this radix node.  If a
 *      meta node, we break the range down recursively to free blocks
 *      in subnodes (which means that this code can free an arbitrary
 *      range whereas the allocation code cannot allocate an arbitrary
 *      range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
        daddr_t blk, endBlk, i, skip;
        int digit, endDigit;

        /*
         * We could probably do a better job here.  We are required to make
         * bighint at least as large as the biggest allocable block of data.
         * If we just shoehorn it, a little extra overhead will be incurred
         * on the next allocation (but only that one typically).
         */
        scan->bm_bighint = BLIST_MAX_ALLOC;

        if (radix == 1)
                return (blst_leaf_free(scan, freeBlk, count));

        endBlk = freeBlk + count;
        blk = (freeBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
        /*
         * blk is first block past the end of the range of this meta node,
         * or 0 in case of overflow.
         */
        if (blk != 0)
                endBlk = ummin(endBlk, blk);
        skip = radix_to_skip(radix);
        blk = freeBlk & -radix;
        digit = (blk / radix) & BLIST_MASK;
        endDigit = 1 + (((endBlk - 1) / radix) & BLIST_MASK);
        scan->bm_bitmap |= bitrange(digit, endDigit - digit);
        for (i = 1 + digit * skip; blk < endBlk; i += skip) {
                blk += radix;
                count = ummin(blk, endBlk) - freeBlk;
                blst_meta_free(&scan[i], freeBlk, count, radix / BLIST_RADIX);
                freeBlk = blk;
        }
}
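
/*
 * Worked example (assuming BLIST_RADIX = 64): freeing [100, 400) at a
 * meta node whose 64 children are leaves (radix = 64): blk = 64,
 * digit = 1, and endDigit = 1 + ((399 / 64) & 63) = 7, so bits 1..6 are
 * set in this node's bitmap; the loop then recurses on the pieces
 * [100, 128), [128, 192), ..., [384, 400), one per child leaf.
 */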

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *      Locates free space in the source tree and frees it in the destination
 *      tree.  The space may not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
        daddr_t endBlk, i, skip;

        /*
         * Leaf node
         */

        if (radix == 1) {
                u_daddr_t v = scan->bm_bitmap;

                if (v == (u_daddr_t)-1) {
                        blist_free(dest, blk, count);
                } else if (v != 0) {
                        int i;

                        for (i = 0; i < count; ++i) {
                                if (v & ((u_daddr_t)1 << i))
                                        blist_free(dest, blk + i, 1);
                        }
                }
                return;
        }

        /*
         * Meta node
         */

        if (scan->bm_bitmap == 0) {
                /*
                 * Source all allocated, leave dest allocated
                 */
                return;
        }

        endBlk = blk + count;
        skip = radix_to_skip(radix);
        for (i = 1; blk < endBlk; i += skip) {
                blk += radix;
                count = radix;
                if (blk >= endBlk)
                        count -= blk - endBlk;
                blst_copy(&scan[i], blk - radix,
                    radix / BLIST_RADIX, dest, count);
        }
}

/*
 * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap
 *
 *      This routine allocates all blocks in the specified range
 *      regardless of any existing allocations in that range.  Returns
 *      the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
        daddr_t nblks;
        u_daddr_t mask;

        mask = bitrange(blk & BLIST_MASK, count);

        /* Count the number of blocks that we are allocating. */
        nblks = bitcount64(scan->bm_bitmap & mask);

        scan->bm_bitmap &= ~mask;
        return (nblks);
}

/*
 * BLST_META_FILL() - allocate specific blocks at a meta node
 *
 *      This routine allocates the specified range of blocks,
 *      regardless of any existing allocations in the range.  The
 *      range must be within the extent of this node.  Returns the
 *      number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
        daddr_t blk, endBlk, i, nblks, skip;
        int digit;

        if (radix == 1)
                return (blst_leaf_fill(scan, allocBlk, count));

        endBlk = allocBlk + count;
        blk = (allocBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX);
        /*
         * blk is first block past the end of the range of this meta node,
         * or 0 in case of overflow.
         */
        if (blk != 0)
                endBlk = ummin(endBlk, blk);
        skip = radix_to_skip(radix);
        blk = allocBlk & -radix;
        nblks = 0;
        while (blk < endBlk) {
                digit = (blk / radix) & BLIST_MASK;
                i = 1 + digit * skip;
                blk += radix;
                count = ummin(blk, endBlk) - allocBlk;
                nblks += blst_meta_fill(&scan[i], allocBlk, count,
                    radix / BLIST_RADIX);
                if (scan[i].bm_bitmap == 0)
                        scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
                allocBlk = blk;
        }
        return (nblks);
}

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
        daddr_t skip;
        u_daddr_t mask;
        int digit;

        if (radix == 1) {
                printf(
                    "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
                    tab, tab, "",
                    (long long)blk, (long long)BLIST_RADIX,
                    (int)(1 + (BLIST_RADIX - 1) / 4),
                    (long long)scan->bm_bitmap,
                    (long long)scan->bm_bighint
                );
                return;
        }

        printf(
            "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
            tab, tab, "",
            (long long)blk, (long long)radix * BLIST_RADIX,
            (long long)radix * BLIST_RADIX,
            (int)(1 + (BLIST_RADIX - 1) / 4),
            (long long)scan->bm_bitmap,
            (long long)scan->bm_bighint
        );

        skip = radix_to_skip(radix);
        tab += 4;

        mask = scan->bm_bitmap;
        /* Examine the nonempty subtree associated with each bit set in mask */
        do {
                digit = bitpos(mask);
                blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
                    radix / BLIST_RADIX, tab);
        } while ((mask ^= bitrange(digit, 1)) != 0);
        tab -= 4;

        printf(
            "%*.*s}\n",
            tab, tab, ""
        );
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
        daddr_t size = BLIST_RADIX * BLIST_RADIX;
        int i;
        blist_t bl;
        struct sbuf *s;

        for (i = 1; i < ac; ++i) {
                const char *ptr = av[i];
                if (*ptr != '-') {
                        size = strtoll(ptr, NULL, 0);
                        continue;
                }
                ptr += 2;
                fprintf(stderr, "Bad option: %s\n", ptr - 2);
                exit(1);
        }
        bl = blist_create(size, M_WAITOK);
        if (bl == NULL) {
                fprintf(stderr, "blist_create failed\n");
                exit(1);
        }
        blist_free(bl, 0, size);

        for (;;) {
                char buf[1024];
                long long da = 0;
                int count = 0, maxcount = 0;

                printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
                    (long long)size, (long long)bl->bl_radix * BLIST_RADIX);
                fflush(stdout);
                if (fgets(buf, sizeof(buf), stdin) == NULL)
                        break;
                switch (buf[0]) {
                case 'r':
                        if (sscanf(buf + 1, "%d", &count) == 1) {
                                blist_resize(&bl, count, 1, M_WAITOK);
                        } else {
                                printf("?\n");
                        }
                        /* FALLTHROUGH: a resize is followed by a print. */
                case 'p':
                        blist_print(bl);
                        break;
                case 's':
                        s = sbuf_new_auto();
                        blist_stats(bl, s);
                        sbuf_finish(s);
                        printf("%s", sbuf_data(s));
                        sbuf_delete(s);
                        break;
                case 'a':
                        if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) {
                                daddr_t blk = blist_alloc(bl, &count, maxcount);
                                printf("    R=%08llx, c=%08d\n",
                                    (long long)blk, count);
                        } else {
                                printf("?\n");
                        }
                        break;
                case 'f':
                        if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
                                blist_free(bl, da, count);
                        } else {
                                printf("?\n");
                        }
                        break;
                case 'l':
                        if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) {
                                printf("    n=%jd\n",
                                    (intmax_t)blist_fill(bl, da, count));
                        } else {
                                printf("?\n");
                        }
                        break;
                case '?':
                case 'h':
                        puts(
                            "p          -print\n"
                            "s          -stats\n"
                            "a %d %d    -allocate\n"
                            "f %x %d    -free\n"
                            "l %x %d    -fill\n"
                            "r %d       -resize\n"
                            "h/?        -help\n"
                            "q          -quit"
                        );
                        break;
                case 'q':
                        break;
                default:
                        printf("?\n");
                        break;
                }
                if (buf[0] == 'q')
                        break;
        }
        return (0);
}

#endif