FreeBSD kernel IPv4 code
in_pcb.c
Go to the documentation of this file.
1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1991, 1993, 1995
5 * The Regents of the University of California.
6 * Copyright (c) 2007-2009 Robert N. M. Watson
7 * Copyright (c) 2010-2011 Juniper Networks, Inc.
8 * All rights reserved.
9 *
10 * Portions of this software were developed by Robert N. M. Watson under
11 * contract to Juniper Networks, Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 * 3. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
38 */
39
40#include <sys/cdefs.h>
41__FBSDID("$FreeBSD$");
42
43#include "opt_ddb.h"
44#include "opt_ipsec.h"
45#include "opt_inet.h"
46#include "opt_inet6.h"
47#include "opt_ratelimit.h"
48#include "opt_route.h"
49#include "opt_rss.h"
50
51#include <sys/param.h>
52#include <sys/hash.h>
53#include <sys/systm.h>
54#include <sys/libkern.h>
55#include <sys/lock.h>
56#include <sys/malloc.h>
57#include <sys/mbuf.h>
58#include <sys/callout.h>
59#include <sys/eventhandler.h>
60#include <sys/domain.h>
61#include <sys/protosw.h>
62#include <sys/smp.h>
63#include <sys/socket.h>
64#include <sys/socketvar.h>
65#include <sys/sockio.h>
66#include <sys/priv.h>
67#include <sys/proc.h>
68#include <sys/refcount.h>
69#include <sys/jail.h>
70#include <sys/kernel.h>
71#include <sys/sysctl.h>
72
73#ifdef DDB
74#include <ddb/ddb.h>
75#endif
76
77#include <vm/uma.h>
78#include <vm/vm.h>
79
80#include <net/if.h>
81#include <net/if_var.h>
82#include <net/if_types.h>
83#include <net/if_llatbl.h>
84#include <net/route.h>
85#include <net/rss_config.h>
86#include <net/vnet.h>
87
88#if defined(INET) || defined(INET6)
89#include <netinet/in.h>
90#include <netinet/in_pcb.h>
91#include <netinet/in_pcb_var.h>
92#ifdef INET
93#include <netinet/in_var.h>
94#include <netinet/in_fib.h>
95#endif
96#include <netinet/ip_var.h>
97#include <netinet/tcp_var.h>
98#ifdef TCPHPTS
99#include <netinet/tcp_hpts.h>
100#endif
101#include <netinet/udp.h>
102#include <netinet/udp_var.h>
103#ifdef INET6
104#include <netinet/ip6.h>
105#include <netinet6/in6_pcb.h>
106#include <netinet6/in6_var.h>
107#include <netinet6/ip6_var.h>
108#endif /* INET6 */
109#include <net/route/nhop.h>
110#endif
111
112#include <netipsec/ipsec_support.h>
113
114#include <security/mac/mac_framework.h>
115
116#define INPCBLBGROUP_SIZMIN 8
117#define INPCBLBGROUP_SIZMAX 256
118#define INP_FREED 0x00000200 /* See in_pcb.h. */
119
120static struct callout ipport_tick_callout;
121
122/*
123 * These configure the range of local port addresses assigned to
124 * "unspecified" outgoing connections/packets/whatever.
125 */
126VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */
127VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */
128VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */
129VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */
130VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */
131VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */
132
133/*
134 * Reserved ports accessible only to root. There are significant
135 * security considerations that must be accounted for when changing these,
136 * but the security benefits can be great. Please be careful.
137 */
138VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */
139VNET_DEFINE(int, ipport_reservedlow);
140
141/* Variables dealing with random ephemeral port allocation. */
142VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */
143VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */
144VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */
145VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */
146VNET_DEFINE(int, ipport_tcpallocs);
147VNET_DEFINE_STATIC(int, ipport_tcplastcount);
148
149#define V_ipport_tcplastcount VNET(ipport_tcplastcount)
150
151#ifdef INET
152static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
153 struct in_addr faddr, u_int fport_arg,
154 struct in_addr laddr, u_int lport_arg,
155 int lookupflags, struct ifnet *ifp,
156 uint8_t numa_domain);
157
/*
 * Clamp the lvalue 'var' into the inclusive range [min, max].
 *
 * The body is wrapped in do { } while (0) so that "RANGECHK(...);" is one
 * statement and remains correct inside an unbraced if/else.  'var' is
 * evaluated more than once and must therefore be free of side effects.
 */
#define	RANGECHK(var, min, max) do {					\
	if ((var) < (min))						\
		(var) = (min);						\
	else if ((var) > (max))						\
		(var) = (max);						\
} while (0)
161
162static int
163sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
164{
165 int error;
166
167 error = sysctl_handle_int(oidp, arg1, arg2, req);
168 if (error == 0) {
169 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
170 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
171 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
172 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
173 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
174 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
175 }
176 return (error);
177}
178
179#undef RANGECHK
180
181static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
182 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
183 "IP Ports");
184
185SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
186 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
187 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
188 "");
189SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
190 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
191 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
192 "");
193SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
194 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
195 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
196 "");
197SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
198 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
199 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
200 "");
201SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
202 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
203 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
204 "");
205SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
206 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
207 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
208 "");
209SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
210 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
211 &VNET_NAME(ipport_reservedhigh), 0, "");
212SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
213 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
214SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
215 CTLFLAG_VNET | CTLFLAG_RW,
216 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
217SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
218 CTLFLAG_VNET | CTLFLAG_RW,
219 &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
220 "allocations before switching to a sequential one");
221SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
222 CTLFLAG_VNET | CTLFLAG_RW,
223 &VNET_NAME(ipport_randomtime), 0,
224 "Minimum time to keep sequential port "
225 "allocation before switching to a random one");
226
227#ifdef RATELIMIT
228counter_u64_t rate_limit_new;
229counter_u64_t rate_limit_chg;
230counter_u64_t rate_limit_active;
231counter_u64_t rate_limit_alloc_fail;
232counter_u64_t rate_limit_set_ok;
233
234static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
235 "IP Rate Limiting");
236SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
237 &rate_limit_active, "Active rate limited connections");
238SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
239 &rate_limit_alloc_fail, "Rate limited connection failures");
240SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
241 &rate_limit_set_ok, "Rate limited setting succeeded");
242SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
243 &rate_limit_new, "Total Rate limit new attempts");
244SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
245 &rate_limit_chg, "Total Rate limited change attempts");
246
247#endif /* RATELIMIT */
248
249#endif /* INET */
250
251VNET_DEFINE(uint32_t, in_pcbhashseed);
252static void
254{
255
256 V_in_pcbhashseed = arc4random();
257}
258VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
260
261/*
262 * in_pcb.c: manage the Protocol Control Blocks.
263 *
264 * NOTE: It is assumed that most of these functions will be called with
265 * the pcbinfo lock held, and often, the inpcb lock held, as these utility
266 * functions often modify hash chains or addresses in pcbs.
267 */
268
269static struct inpcblbgroup *
270in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
271 uint16_t port, const union in_dependaddr *addr, int size,
272 uint8_t numa_domain)
273{
274 struct inpcblbgroup *grp;
275 size_t bytes;
276
277 bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
278 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
279 if (!grp)
280 return (NULL);
281 grp->il_vflag = vflag;
282 grp->il_lport = port;
283 grp->il_numa_domain = numa_domain;
284 grp->il_dependladdr = *addr;
285 grp->il_inpsiz = size;
286 CK_LIST_INSERT_HEAD(hdr, grp, il_list);
287 return (grp);
288}
289
290static void
291in_pcblbgroup_free_deferred(epoch_context_t ctx)
292{
293 struct inpcblbgroup *grp;
294
295 grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
296 free(grp, M_PCB);
297}
298
299static void
301{
302
303 CK_LIST_REMOVE(grp, il_list);
304 NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
305}
306
307static struct inpcblbgroup *
308in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
309 struct inpcblbgroup *old_grp, int size)
310{
311 struct inpcblbgroup *grp;
312 int i;
313
314 grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
315 old_grp->il_lport, &old_grp->il_dependladdr, size,
316 old_grp->il_numa_domain);
317 if (grp == NULL)
318 return (NULL);
319
320 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
321 ("invalid new local group size %d and old local group count %d",
322 grp->il_inpsiz, old_grp->il_inpcnt));
323
324 for (i = 0; i < old_grp->il_inpcnt; ++i)
325 grp->il_inp[i] = old_grp->il_inp[i];
326 grp->il_inpcnt = old_grp->il_inpcnt;
327 in_pcblbgroup_free(old_grp);
328 return (grp);
329}
330
331/*
332 * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
333 * and shrink group if possible.
334 */
335static void
336in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
337 int i)
338{
339 struct inpcblbgroup *grp, *new_grp;
340
341 grp = *grpp;
342 for (; i + 1 < grp->il_inpcnt; ++i)
343 grp->il_inp[i] = grp->il_inp[i + 1];
344 grp->il_inpcnt--;
345
346 if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
347 grp->il_inpcnt <= grp->il_inpsiz / 4) {
348 /* Shrink this group. */
349 new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
350 if (new_grp != NULL)
351 *grpp = new_grp;
352 }
353}
354
355/*
356 * Add PCB to load balance group for SO_REUSEPORT_LB option.
357 */
358static int
359in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
360{
361 const static struct timeval interval = { 60, 0 };
362 static struct timeval lastprint;
363 struct inpcbinfo *pcbinfo;
364 struct inpcblbgrouphead *hdr;
365 struct inpcblbgroup *grp;
366 uint32_t idx;
367
368 pcbinfo = inp->inp_pcbinfo;
369
370 INP_WLOCK_ASSERT(inp);
371 INP_HASH_WLOCK_ASSERT(pcbinfo);
372
373 /*
374 * Don't allow jailed socket to join local group.
375 */
376 if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred))
377 return (0);
378
379#ifdef INET6
380 /*
381 * Don't allow IPv4 mapped INET6 wild socket.
382 */
383 if ((inp->inp_vflag & INP_IPV4) &&
384 inp->inp_laddr.s_addr == INADDR_ANY &&
385 INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
386 return (0);
387 }
388#endif
389
390 idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
391 hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
392 CK_LIST_FOREACH(grp, hdr, il_list) {
393 if (grp->il_vflag == inp->inp_vflag &&
394 grp->il_lport == inp->inp_lport &&
395 grp->il_numa_domain == numa_domain &&
396 memcmp(&grp->il_dependladdr,
398 sizeof(grp->il_dependladdr)) == 0)
399 break;
400 }
401 if (grp == NULL) {
402 /* Create new load balance group. */
403 grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
404 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
405 INPCBLBGROUP_SIZMIN, numa_domain);
406 if (grp == NULL)
407 return (ENOBUFS);
408 } else if (grp->il_inpcnt == grp->il_inpsiz) {
409 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
410 if (ratecheck(&lastprint, &interval))
411 printf("lb group port %d, limit reached\n",
412 ntohs(grp->il_lport));
413 return (0);
414 }
415
416 /* Expand this local group. */
417 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
418 if (grp == NULL)
419 return (ENOBUFS);
420 }
421
422 KASSERT(grp->il_inpcnt < grp->il_inpsiz,
423 ("invalid local group size %d and count %d", grp->il_inpsiz,
424 grp->il_inpcnt));
425
426 grp->il_inp[grp->il_inpcnt] = inp;
427 grp->il_inpcnt++;
428 return (0);
429}
430
431/*
432 * Remove PCB from load balance group.
433 */
434static void
436{
437 struct inpcbinfo *pcbinfo;
438 struct inpcblbgrouphead *hdr;
439 struct inpcblbgroup *grp;
440 int i;
441
442 pcbinfo = inp->inp_pcbinfo;
443
444 INP_WLOCK_ASSERT(inp);
445 INP_HASH_WLOCK_ASSERT(pcbinfo);
446
447 hdr = &pcbinfo->ipi_lbgrouphashbase[
448 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
449 CK_LIST_FOREACH(grp, hdr, il_list) {
450 for (i = 0; i < grp->il_inpcnt; ++i) {
451 if (grp->il_inp[i] != inp)
452 continue;
453
454 if (grp->il_inpcnt == 1) {
455 /* We are the last, free this local group. */
457 } else {
458 /* Pull up inpcbs, shrink group if possible. */
459 in_pcblbgroup_reorder(hdr, &grp, i);
460 }
461 return;
462 }
463 }
464}
465
466int
467in_pcblbgroup_numa(struct inpcb *inp, int arg)
468{
469 struct inpcbinfo *pcbinfo;
470 struct inpcblbgrouphead *hdr;
471 struct inpcblbgroup *grp;
472 int err, i;
473 uint8_t numa_domain;
474
475 switch (arg) {
477 numa_domain = M_NODOM;
478 break;
480 numa_domain = PCPU_GET(domain);
481 break;
482 default:
483 if (arg < 0 || arg >= vm_ndomains)
484 return (EINVAL);
485 numa_domain = arg;
486 }
487
488 err = 0;
489 pcbinfo = inp->inp_pcbinfo;
490 INP_WLOCK_ASSERT(inp);
491 INP_HASH_WLOCK(pcbinfo);
492 hdr = &pcbinfo->ipi_lbgrouphashbase[
493 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
494 CK_LIST_FOREACH(grp, hdr, il_list) {
495 for (i = 0; i < grp->il_inpcnt; ++i) {
496 if (grp->il_inp[i] != inp)
497 continue;
498
499 if (grp->il_numa_domain == numa_domain) {
500 goto abort_with_hash_wlock;
501 }
502
503 /* Remove it from the old group. */
505
506 /* Add it to the new group based on numa domain. */
507 in_pcbinslbgrouphash(inp, numa_domain);
508 goto abort_with_hash_wlock;
509 }
510 }
511 err = ENOENT;
512abort_with_hash_wlock:
513 INP_HASH_WUNLOCK(pcbinfo);
514 return (err);
515}
516
517/* Make sure it is safe to use hashinit(9) on CK_LIST. */
518CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));
519
520/*
521 * Initialize an inpcbinfo - a per-VNET instance of connections db.
522 */
523void
524in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
525 u_int hash_nelements, u_int porthash_nelements)
526{
527
528 mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
529 mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
530 NULL, MTX_DEF);
531#ifdef VIMAGE
532 pcbinfo->ipi_vnet = curvnet;
533#endif
534 CK_LIST_INIT(&pcbinfo->ipi_listhead);
535 pcbinfo->ipi_count = 0;
536 pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
537 &pcbinfo->ipi_hashmask);
538 porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
539 pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
540 &pcbinfo->ipi_porthashmask);
541 pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
542 &pcbinfo->ipi_lbgrouphashmask);
543 pcbinfo->ipi_zone = pcbstor->ips_zone;
544 pcbinfo->ipi_portzone = pcbstor->ips_portzone;
545 pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
546}
547
548/*
549 * Destroy an inpcbinfo.
550 */
551void
553{
554
555 KASSERT(pcbinfo->ipi_count == 0,
556 ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
557
558 hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
559 hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
560 pcbinfo->ipi_porthashmask);
561 hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
562 pcbinfo->ipi_lbgrouphashmask);
563 mtx_destroy(&pcbinfo->ipi_hash_lock);
564 mtx_destroy(&pcbinfo->ipi_lock);
565}
566
567/*
568 * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
569 */
570static void inpcb_dtor(void *, int, void *);
571static void inpcb_fini(void *, int);
572void
574{
575 struct inpcbstorage *pcbstor = arg;
576
577 pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
578 sizeof(struct inpcb), NULL, inpcb_dtor, pcbstor->ips_pcbinit,
579 inpcb_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR);
580 pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name,
581 sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
582 uma_zone_set_smr(pcbstor->ips_portzone,
583 uma_zone_get_smr(pcbstor->ips_zone));
584}
585
586/*
587 * Destroy a pcbstorage - used by unloadable protocols.
588 */
589void
591{
592 struct inpcbstorage *pcbstor = arg;
593
594 uma_zdestroy(pcbstor->ips_zone);
595 uma_zdestroy(pcbstor->ips_portzone);
596}
597
598/*
599 * Allocate a PCB and associate it with the socket.
600 * On success return with the PCB locked.
601 */
602int
603in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
604{
605 struct inpcb *inp;
606 int error;
607
608 error = 0;
609 inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
610 if (inp == NULL)
611 return (ENOBUFS);
612 bzero(&inp->inp_start_zero, inp_zero_size);
613#ifdef NUMA
614 inp->inp_numa_domain = M_NODOM;
615#endif
616 inp->inp_pcbinfo = pcbinfo;
617 inp->inp_socket = so;
618 inp->inp_cred = crhold(so->so_cred);
619 inp->inp_inc.inc_fibnum = so->so_fibnum;
620#ifdef MAC
621 error = mac_inpcb_init(inp, M_NOWAIT);
622 if (error != 0)
623 goto out;
624 mac_inpcb_create(so, inp);
625#endif
626#if defined(IPSEC) || defined(IPSEC_SUPPORT)
627 error = ipsec_init_pcbpolicy(inp);
628 if (error != 0) {
629#ifdef MAC
630 mac_inpcb_destroy(inp);
631#endif
632 goto out;
633 }
634#endif /*IPSEC*/
635#ifdef INET6
636 if (INP_SOCKAF(so) == AF_INET6) {
637 inp->inp_vflag |= INP_IPV6PROTO;
638 if (V_ip6_v6only)
640 }
641 if (V_ip6_auto_flowlabel)
643#endif
644 /*
645 * Routes in inpcb's can cache L2 as well; they are guaranteed
646 * to be cleaned up.
647 */
648 inp->inp_route.ro_flags = RT_LLE_CACHE;
649#ifdef TCPHPTS
650 /*
651 * If using hpts lets drop a random number in so
652 * not all new connections fall on the same CPU.
653 */
654 inp->inp_hpts_cpu = hpts_random_cpu(inp);
655#endif
656 refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */
657 INP_WLOCK(inp);
658 INP_INFO_WLOCK(pcbinfo);
659 pcbinfo->ipi_count++;
660 inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
661 CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
662 INP_INFO_WUNLOCK(pcbinfo);
663 so->so_pcb = inp;
664
665 return (0);
666
667#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
668out:
669 uma_zfree_smr(pcbinfo->ipi_zone, inp);
670 return (error);
671#endif
672}
673
674#ifdef INET
675int
676in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
677{
678 int anonport, error;
679
680 KASSERT(nam == NULL || nam->sa_family == AF_INET,
681 ("%s: invalid address family for %p", __func__, nam));
682 KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in),
683 ("%s: invalid address length for %p", __func__, nam));
684 INP_WLOCK_ASSERT(inp);
686
687 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
688 return (EINVAL);
689 anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0;
690 error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
691 &inp->inp_lport, cred);
692 if (error)
693 return (error);
694 if (in_pcbinshash(inp) != 0) {
695 inp->inp_laddr.s_addr = INADDR_ANY;
696 inp->inp_lport = 0;
697 return (EAGAIN);
698 }
699 if (anonport)
700 inp->inp_flags |= INP_ANONPORT;
701 return (0);
702}
703#endif
704
705#if defined(INET) || defined(INET6)
706/*
707 * Assign a local port like in_pcb_lport(), but also used with connect()
708 * and a foreign address and port. If fsa is non-NULL, choose a local port
709 * that is unused with those, otherwise one that is completely unused.
710 * lsa can be NULL for IPv6.
711 */
712int
713in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
714 struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
715{
716 struct inpcbinfo *pcbinfo;
717 struct inpcb *tmpinp;
718 unsigned short *lastport;
719 int count, dorandom, error;
720 u_short aux, first, last, lport;
721#ifdef INET
722 struct in_addr laddr, faddr;
723#endif
724#ifdef INET6
725 struct in6_addr *laddr6, *faddr6;
726#endif
727
728 pcbinfo = inp->inp_pcbinfo;
729
730 /*
731 * Because no actual state changes occur here, a global write lock on
732 * the pcbinfo isn't required.
733 */
734 INP_LOCK_ASSERT(inp);
735 INP_HASH_LOCK_ASSERT(pcbinfo);
736
737 if (inp->inp_flags & INP_HIGHPORT) {
738 first = V_ipport_hifirstauto; /* sysctl */
739 last = V_ipport_hilastauto;
740 lastport = &pcbinfo->ipi_lasthi;
741 } else if (inp->inp_flags & INP_LOWPORT) {
742 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
743 if (error)
744 return (error);
745 first = V_ipport_lowfirstauto; /* 1023 */
746 last = V_ipport_lowlastauto; /* 600 */
747 lastport = &pcbinfo->ipi_lastlow;
748 } else {
749 first = V_ipport_firstauto; /* sysctl */
750 last = V_ipport_lastauto;
751 lastport = &pcbinfo->ipi_lastport;
752 }
753 /*
754 * For UDP(-Lite), use random port allocation as long as the user
755 * allows it. For TCP (and as of yet unknown) connections,
756 * use random port allocation only if the user allows it AND
757 * ipport_tick() allows it.
758 */
760 (!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
761 pcbinfo == &V_ulitecbinfo))
762 dorandom = 1;
763 else
764 dorandom = 0;
765 /*
766 * It makes no sense to do random port allocation if
767 * we have the only port available.
768 */
769 if (first == last)
770 dorandom = 0;
771 /* Make sure to not include UDP(-Lite) packets in the count. */
772 if (pcbinfo != &V_udbinfo && pcbinfo != &V_ulitecbinfo)
774 /*
775 * Instead of having two loops further down counting up or down
776 * make sure that first is always <= last and go with only one
777 * code path implementing all logic.
778 */
779 if (first > last) {
780 aux = first;
781 first = last;
782 last = aux;
783 }
784
785#ifdef INET
786 laddr.s_addr = INADDR_ANY;
787 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
788 if (lsa != NULL)
789 laddr = ((struct sockaddr_in *)lsa)->sin_addr;
790 if (fsa != NULL)
791 faddr = ((struct sockaddr_in *)fsa)->sin_addr;
792 }
793#endif
794#ifdef INET6
795 laddr6 = NULL;
796 if ((inp->inp_vflag & INP_IPV6) != 0) {
797 if (lsa != NULL)
798 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
799 if (fsa != NULL)
800 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
801 }
802#endif
803
804 tmpinp = NULL;
805 lport = *lportp;
806
807 if (dorandom)
808 *lastport = first + (arc4random() % (last - first));
809
810 count = last - first;
811
812 do {
813 if (count-- < 0) /* completely used? */
814 return (EADDRNOTAVAIL);
815 ++*lastport;
816 if (*lastport < first || *lastport > last)
817 *lastport = first;
818 lport = htons(*lastport);
819
820 if (fsa != NULL) {
821#ifdef INET
822 if (lsa->sa_family == AF_INET) {
823 tmpinp = in_pcblookup_hash_locked(pcbinfo,
824 faddr, fport, laddr, lport, lookupflags,
825 NULL, M_NODOM);
826 }
827#endif
828#ifdef INET6
829 if (lsa->sa_family == AF_INET6) {
830 tmpinp = in6_pcblookup_hash_locked(pcbinfo,
831 faddr6, fport, laddr6, lport, lookupflags,
832 NULL, M_NODOM);
833 }
834#endif
835 } else {
836#ifdef INET6
837 if ((inp->inp_vflag & INP_IPV6) != 0)
838 tmpinp = in6_pcblookup_local(pcbinfo,
839 &inp->in6p_laddr, lport, lookupflags, cred);
840#endif
841#if defined(INET) && defined(INET6)
842 else
843#endif
844#ifdef INET
845 tmpinp = in_pcblookup_local(pcbinfo, laddr,
846 lport, lookupflags, cred);
847#endif
848 }
849 } while (tmpinp != NULL);
850
851 *lportp = lport;
852
853 return (0);
854}
855
856/*
857 * Select a local port (number) to use.
858 */
859int
860in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
861 struct ucred *cred, int lookupflags)
862{
863 struct sockaddr_in laddr;
864
865 if (laddrp) {
866 bzero(&laddr, sizeof(laddr));
867 laddr.sin_family = AF_INET;
868 laddr.sin_addr = *laddrp;
869 }
870 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
871 NULL, lportp, NULL, 0, cred, lookupflags));
872}
873
874/*
875 * Return cached socket options.
876 */
877int
878inp_so_options(const struct inpcb *inp)
879{
880 int so_options;
881
882 so_options = 0;
883
884 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
885 so_options |= SO_REUSEPORT_LB;
886 if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
887 so_options |= SO_REUSEPORT;
888 if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
889 so_options |= SO_REUSEADDR;
890 return (so_options);
891}
892#endif /* INET || INET6 */
893
894/*
895 * Check if a new BINDMULTI socket is allowed to be created.
896 *
897 * ni points to the new inp.
898 * oi points to the exisitng inp.
899 *
900 * This checks whether the existing inp also has BINDMULTI and
901 * whether the credentials match.
902 */
903int
904in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
905{
906 /* Check permissions match */
907 if ((ni->inp_flags2 & INP_BINDMULTI) &&
908 (ni->inp_cred->cr_uid !=
909 oi->inp_cred->cr_uid))
910 return (0);
911
912 /* Check the existing inp has BINDMULTI set */
913 if ((ni->inp_flags2 & INP_BINDMULTI) &&
914 ((oi->inp_flags2 & INP_BINDMULTI) == 0))
915 return (0);
916
917 /*
918 * We're okay - either INP_BINDMULTI isn't set on ni, or
919 * it is and it matches the checks.
920 */
921 return (1);
922}
923
924#ifdef INET
925/*
926 * Set up a bind operation on a PCB, performing port allocation
927 * as required, but do not actually modify the PCB. Callers can
928 * either complete the bind by setting inp_laddr/inp_lport and
929 * calling in_pcbinshash(), or they can just use the resulting
930 * port and address to authorise the sending of a once-off packet.
931 *
932 * On error, the values of *laddrp and *lportp are not changed.
933 */
934int
935in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
936 u_short *lportp, struct ucred *cred)
937{
938 struct socket *so = inp->inp_socket;
939 struct sockaddr_in *sin;
940 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
941 struct in_addr laddr;
942 u_short lport = 0;
943 int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
944 int error;
945
946 /*
947 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
948 * so that we don't have to add to the (already messy) code below.
949 */
950 int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
951
952 /*
953 * No state changes, so read locks are sufficient here.
954 */
955 INP_LOCK_ASSERT(inp);
956 INP_HASH_LOCK_ASSERT(pcbinfo);
957
958 laddr.s_addr = *laddrp;
959 if (nam != NULL && laddr.s_addr != INADDR_ANY)
960 return (EINVAL);
961 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
962 lookupflags = INPLOOKUP_WILDCARD;
963 if (nam == NULL) {
964 if ((error = prison_local_ip4(cred, &laddr)) != 0)
965 return (error);
966 } else {
967 sin = (struct sockaddr_in *)nam;
968 KASSERT(sin->sin_family == AF_INET,
969 ("%s: invalid family for address %p", __func__, sin));
970 KASSERT(sin->sin_len == sizeof(*sin),
971 ("%s: invalid length for address %p", __func__, sin));
972
973 error = prison_local_ip4(cred, &sin->sin_addr);
974 if (error)
975 return (error);
976 if (sin->sin_port != *lportp) {
977 /* Don't allow the port to change. */
978 if (*lportp != 0)
979 return (EINVAL);
980 lport = sin->sin_port;
981 }
982 /* NB: lport is left as 0 if the port isn't being changed. */
983 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
984 /*
985 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
986 * allow complete duplication of binding if
987 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
988 * and a multicast address is bound on both
989 * new and duplicated sockets.
990 */
991 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
992 reuseport = SO_REUSEADDR|SO_REUSEPORT;
993 /*
994 * XXX: How to deal with SO_REUSEPORT_LB here?
995 * Treat same as SO_REUSEPORT for now.
996 */
997 if ((so->so_options &
998 (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
999 reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
1000 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
1001 sin->sin_port = 0; /* yech... */
1002 bzero(&sin->sin_zero, sizeof(sin->sin_zero));
1003 /*
1004 * Is the address a local IP address?
1005 * If INP_BINDANY is set, then the socket may be bound
1006 * to any endpoint address, local or not.
1007 */
1008 if ((inp->inp_flags & INP_BINDANY) == 0 &&
1009 ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
1010 return (EADDRNOTAVAIL);
1011 }
1012 laddr = sin->sin_addr;
1013 if (lport) {
1014 struct inpcb *t;
1015 struct tcptw *tw;
1016
1017 /* GROSS */
1018 if (ntohs(lport) <= V_ipport_reservedhigh &&
1019 ntohs(lport) >= V_ipport_reservedlow &&
1020 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
1021 return (EACCES);
1022 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
1023 priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
1024 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
1025 lport, INPLOOKUP_WILDCARD, cred);
1026 /*
1027 * XXX
1028 * This entire block sorely needs a rewrite.
1029 */
1030 if (t &&
1031 ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
1032 ((t->inp_flags & INP_TIMEWAIT) == 0) &&
1033 (so->so_type != SOCK_STREAM ||
1034 ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
1035 (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
1036 ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
1037 (t->inp_flags2 & INP_REUSEPORT) ||
1038 (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
1039 (inp->inp_cred->cr_uid !=
1040 t->inp_cred->cr_uid))
1041 return (EADDRINUSE);
1042
1043 /*
1044 * If the socket is a BINDMULTI socket, then
1045 * the credentials need to match and the
1046 * original socket also has to have been bound
1047 * with BINDMULTI.
1048 */
1049 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
1050 return (EADDRINUSE);
1051 }
1052 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
1053 lport, lookupflags, cred);
1054 if (t && (t->inp_flags & INP_TIMEWAIT)) {
1055 /*
1056 * XXXRW: If an inpcb has had its timewait
1057 * state recycled, we treat the address as
1058 * being in use (for now). This is better
1059 * than a panic, but not desirable.
1060 */
1061 tw = intotw(t);
1062 if (tw == NULL ||
1063 ((reuseport & tw->tw_so_options) == 0 &&
1064 (reuseport_lb &
1065 tw->tw_so_options) == 0)) {
1066 return (EADDRINUSE);
1067 }
1068 } else if (t &&
1069 ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
1070 (reuseport & inp_so_options(t)) == 0 &&
1071 (reuseport_lb & inp_so_options(t)) == 0) {
1072#ifdef INET6
1073 if (ntohl(sin->sin_addr.s_addr) !=
1074 INADDR_ANY ||
1075 ntohl(t->inp_laddr.s_addr) !=
1076 INADDR_ANY ||
1077 (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
1078 (t->inp_vflag & INP_IPV6PROTO) == 0)
1079#endif
1080 return (EADDRINUSE);
1081 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
1082 return (EADDRINUSE);
1083 }
1084 }
1085 }
1086 if (*lportp != 0)
1087 lport = *lportp;
1088 if (lport == 0) {
1089 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
1090 if (error != 0)
1091 return (error);
1092 }
1093 *laddrp = laddr.s_addr;
1094 *lportp = lport;
1095 return (0);
1096}
1097
1098/*
1099 * Connect from a socket to a specified address.
1100 * Both address and port must be specified in argument sin.
1101 * If don't have a local address for this socket yet,
1102 * then pick one.
1103 */
1104int
1105in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred,
1106 bool rehash)
1107{
1108 u_short lport, fport;
1109 in_addr_t laddr, faddr;
1110 int anonport, error;
1111
1112 INP_WLOCK_ASSERT(inp);
1114
1115 lport = inp->inp_lport;
1116 laddr = inp->inp_laddr.s_addr;
1117 anonport = (lport == 0);
1118 error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
1119 NULL, cred);
1120 if (error)
1121 return (error);
1122
1123 /* Do the initial binding of the local address if required. */
1124 if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
1125 KASSERT(rehash == true,
1126 ("Rehashing required for unbound inps"));
1127 inp->inp_lport = lport;
1128 inp->inp_laddr.s_addr = laddr;
1129 if (in_pcbinshash(inp) != 0) {
1130 inp->inp_laddr.s_addr = INADDR_ANY;
1131 inp->inp_lport = 0;
1132 return (EAGAIN);
1133 }
1134 }
1135
1136 /* Commit the remaining changes. */
1137 inp->inp_lport = lport;
1138 inp->inp_laddr.s_addr = laddr;
1139 inp->inp_faddr.s_addr = faddr;
1140 inp->inp_fport = fport;
1141 if (rehash) {
1142 in_pcbrehash(inp);
1143 } else {
1144 in_pcbinshash(inp);
1145 }
1146
1147 if (anonport)
1148 inp->inp_flags |= INP_ANONPORT;
1149 return (0);
1150}
1151
1152/*
1153 * Do proper source address selection on an unbound socket in case
1154 * of connect. Take jails into account as well.
1155 */
1156int
1157in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
1158 struct ucred *cred)
1159{
1160 struct ifaddr *ifa;
1161 struct sockaddr *sa;
1162 struct sockaddr_in *sin, dst;
1163 struct nhop_object *nh;
1164 int error;
1165
1166 NET_EPOCH_ASSERT();
1167 KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
1168 /*
1169 * Bypass source address selection and use the primary jail IP
1170 * if requested.
1171 */
1172 if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
1173 return (0);
1174
1175 error = 0;
1176
1177 nh = NULL;
1178 bzero(&dst, sizeof(dst));
1179 sin = &dst;
1180 sin->sin_family = AF_INET;
1181 sin->sin_len = sizeof(struct sockaddr_in);
1182 sin->sin_addr.s_addr = faddr->s_addr;
1183
1184 /*
1185 * If route is known our src addr is taken from the i/f,
1186 * else punt.
1187 *
1188 * Find out route to destination.
1189 */
1190 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
1191 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
1192 0, NHR_NONE, 0);
1193
1194 /*
1195 * If we found a route, use the address corresponding to
1196 * the outgoing interface.
1197 *
1198 * Otherwise assume faddr is reachable on a directly connected
1199 * network and try to find a corresponding interface to take
1200 * the source address from.
1201 */
1202 if (nh == NULL || nh->nh_ifp == NULL) {
1203 struct in_ifaddr *ia;
1204 struct ifnet *ifp;
1205
1206 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
1207 inp->inp_socket->so_fibnum));
1208 if (ia == NULL) {
1209 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
1210 inp->inp_socket->so_fibnum));
1211 }
1212 if (ia == NULL) {
1213 error = ENETUNREACH;
1214 goto done;
1215 }
1216
1217 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1218 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1219 goto done;
1220 }
1221
1222 ifp = ia->ia_ifp;
1223 ia = NULL;
1224 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1225 sa = ifa->ifa_addr;
1226 if (sa->sa_family != AF_INET)
1227 continue;
1228 sin = (struct sockaddr_in *)sa;
1229 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1230 ia = (struct in_ifaddr *)ifa;
1231 break;
1232 }
1233 }
1234 if (ia != NULL) {
1235 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1236 goto done;
1237 }
1238
1239 /* 3. As a last resort return the 'default' jail address. */
1240 error = prison_get_ip4(cred, laddr);
1241 goto done;
1242 }
1243
1244 /*
1245 * If the outgoing interface on the route found is not
1246 * a loopback interface, use the address from that interface.
1247 * In case of jails do those three steps:
1248 * 1. check if the interface address belongs to the jail. If so use it.
1249 * 2. check if we have any address on the outgoing interface
1250 * belonging to this jail. If so use it.
1251 * 3. as a last resort return the 'default' jail address.
1252 */
1253 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
1254 struct in_ifaddr *ia;
1255 struct ifnet *ifp;
1256
1257 /* If not jailed, use the default returned. */
1258 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1259 ia = (struct in_ifaddr *)nh->nh_ifa;
1260 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1261 goto done;
1262 }
1263
1264 /* Jailed. */
1265 /* 1. Check if the iface address belongs to the jail. */
1266 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
1267 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1268 ia = (struct in_ifaddr *)nh->nh_ifa;
1269 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1270 goto done;
1271 }
1272
1273 /*
1274 * 2. Check if we have any address on the outgoing interface
1275 * belonging to this jail.
1276 */
1277 ia = NULL;
1278 ifp = nh->nh_ifp;
1279 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1280 sa = ifa->ifa_addr;
1281 if (sa->sa_family != AF_INET)
1282 continue;
1283 sin = (struct sockaddr_in *)sa;
1284 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1285 ia = (struct in_ifaddr *)ifa;
1286 break;
1287 }
1288 }
1289 if (ia != NULL) {
1290 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1291 goto done;
1292 }
1293
1294 /* 3. As a last resort return the 'default' jail address. */
1295 error = prison_get_ip4(cred, laddr);
1296 goto done;
1297 }
1298
1299 /*
1300 * The outgoing interface is marked with 'loopback net', so a route
1301 * to ourselves is here.
1302 * Try to find the interface of the destination address and then
1303 * take the address from there. That interface is not necessarily
1304 * a loopback interface.
1305 * In case of jails, check that it is an address of the jail
1306 * and if we cannot find, fall back to the 'default' jail address.
1307 */
1308 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
1309 struct in_ifaddr *ia;
1310
1311 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
1312 inp->inp_socket->so_fibnum));
1313 if (ia == NULL)
1314 ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
1315 inp->inp_socket->so_fibnum));
1316 if (ia == NULL)
1317 ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));
1318
1319 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1320 if (ia == NULL) {
1321 error = ENETUNREACH;
1322 goto done;
1323 }
1324 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1325 goto done;
1326 }
1327
1328 /* Jailed. */
1329 if (ia != NULL) {
1330 struct ifnet *ifp;
1331
1332 ifp = ia->ia_ifp;
1333 ia = NULL;
1334 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1335 sa = ifa->ifa_addr;
1336 if (sa->sa_family != AF_INET)
1337 continue;
1338 sin = (struct sockaddr_in *)sa;
1339 if (prison_check_ip4(cred,
1340 &sin->sin_addr) == 0) {
1341 ia = (struct in_ifaddr *)ifa;
1342 break;
1343 }
1344 }
1345 if (ia != NULL) {
1346 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1347 goto done;
1348 }
1349 }
1350
1351 /* 3. As a last resort return the 'default' jail address. */
1352 error = prison_get_ip4(cred, laddr);
1353 goto done;
1354 }
1355
1356done:
1357 return (error);
1358}
1359
1360/*
1361 * Set up for a connect from a socket to the specified address.
1362 * On entry, *laddrp and *lportp should contain the current local
1363 * address and port for the PCB; these are updated to the values
1364 * that should be placed in inp_laddr and inp_lport to complete
1365 * the connect.
1366 *
1367 * On success, *faddrp and *fportp will be set to the remote address
1368 * and port. These are not updated in the error case.
1369 *
1370 * If the operation fails because the connection already exists,
1371 * *oinpp will be set to the PCB of that connection so that the
1372 * caller can decide to override it. In all other cases, *oinpp
1373 * is set to NULL.
1374 */
1375int
1376in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
1377 in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
1378 struct inpcb **oinpp, struct ucred *cred)
1379{
1380 struct sockaddr_in *sin = (struct sockaddr_in *)nam;
1381 struct in_ifaddr *ia;
1382 struct inpcb *oinp;
1383 struct in_addr laddr, faddr;
1384 u_short lport, fport;
1385 int error;
1386
1387 KASSERT(sin->sin_family == AF_INET,
1388 ("%s: invalid address family for %p", __func__, sin));
1389 KASSERT(sin->sin_len == sizeof(*sin),
1390 ("%s: invalid address length for %p", __func__, sin));
1391
1392 /*
1393 * Because a global state change doesn't actually occur here, a read
1394 * lock is sufficient.
1395 */
1396 NET_EPOCH_ASSERT();
1397 INP_LOCK_ASSERT(inp);
1399
1400 if (oinpp != NULL)
1401 *oinpp = NULL;
1402 if (sin->sin_port == 0)
1403 return (EADDRNOTAVAIL);
1404 laddr.s_addr = *laddrp;
1405 lport = *lportp;
1406 faddr = sin->sin_addr;
1407 fport = sin->sin_port;
1408#ifdef ROUTE_MPATH
1409 if (CALC_FLOWID_OUTBOUND) {
1410 uint32_t hash_val, hash_type;
1411
1412 hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
1413 inp->inp_socket->so_proto->pr_protocol, &hash_type);
1414
1415 inp->inp_flowid = hash_val;
1416 inp->inp_flowtype = hash_type;
1417 }
1418#endif
1419 if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
1420 /*
1421 * If the destination address is INADDR_ANY,
1422 * use the primary local address.
1423 * If the supplied address is INADDR_BROADCAST,
1424 * and the primary interface supports broadcast,
1425 * choose the broadcast address for that interface.
1426 */
1427 if (faddr.s_addr == INADDR_ANY) {
1428 faddr =
1429 IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
1430 if (cred != NULL &&
1431 (error = prison_get_ip4(cred, &faddr)) != 0)
1432 return (error);
1433 } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
1434 if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
1435 IFF_BROADCAST)
1436 faddr = satosin(&CK_STAILQ_FIRST(
1437 &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
1438 }
1439 }
1440 if (laddr.s_addr == INADDR_ANY) {
1441 error = in_pcbladdr(inp, &faddr, &laddr, cred);
1442 /*
1443 * If the destination address is multicast and an outgoing
1444 * interface has been set as a multicast option, prefer the
1445 * address of that interface as our source address.
1446 */
1447 if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
1448 inp->inp_moptions != NULL) {
1449 struct ip_moptions *imo;
1450 struct ifnet *ifp;
1451
1452 imo = inp->inp_moptions;
1453 if (imo->imo_multicast_ifp != NULL) {
1454 ifp = imo->imo_multicast_ifp;
1455 CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
1456 if ((ia->ia_ifp == ifp) &&
1457 (cred == NULL ||
1458 prison_check_ip4(cred,
1459 &ia->ia_addr.sin_addr) == 0))
1460 break;
1461 }
1462 if (ia == NULL)
1463 error = EADDRNOTAVAIL;
1464 else {
1465 laddr = ia->ia_addr.sin_addr;
1466 error = 0;
1467 }
1468 }
1469 }
1470 if (error)
1471 return (error);
1472 }
1473
1474 if (lport != 0) {
1475 oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
1476 fport, laddr, lport, 0, NULL, M_NODOM);
1477 if (oinp != NULL) {
1478 if (oinpp != NULL)
1479 *oinpp = oinp;
1480 return (EADDRINUSE);
1481 }
1482 } else {
1483 struct sockaddr_in lsin, fsin;
1484
1485 bzero(&lsin, sizeof(lsin));
1486 bzero(&fsin, sizeof(fsin));
1487 lsin.sin_family = AF_INET;
1488 lsin.sin_addr = laddr;
1489 fsin.sin_family = AF_INET;
1490 fsin.sin_addr = faddr;
1491 error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
1492 &lport, (struct sockaddr *)& fsin, fport, cred,
1494 if (error)
1495 return (error);
1496 }
1497 *laddrp = laddr.s_addr;
1498 *lportp = lport;
1499 *faddrp = faddr.s_addr;
1500 *fportp = fport;
1501 return (0);
1502}
1503
1504void
1505in_pcbdisconnect(struct inpcb *inp)
1506{
1507
1508 INP_WLOCK_ASSERT(inp);
1510
1511 inp->inp_faddr.s_addr = INADDR_ANY;
1512 inp->inp_fport = 0;
1513 in_pcbrehash(inp);
1514}
1515#endif /* INET */
1516
1517/*
1518 * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
1519 * For most protocols, this will be invoked immediately prior to calling
1520 * in_pcbfree(). However, with TCP the inpcb may significantly outlive the
1521 * socket, in which case in_pcbfree() is deferred.
1522 */
1523void
1525{
1526
1527 KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
1528
1529#ifdef RATELIMIT
1530 if (inp->inp_snd_tag != NULL)
1531 in_pcbdetach_txrtlmt(inp);
1532#endif
1533 inp->inp_socket->so_pcb = NULL;
1534 inp->inp_socket = NULL;
1535}
1536
1537/*
1538 * inpcb hash lookups are protected by SMR section.
1539 *
1540 * Once desired pcb has been found, switching from SMR section to a pcb
1541 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
1542 * here because SMR is a critical section.
1543 * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
1544 */
1545static inline void
1546inp_lock(struct inpcb *inp, const inp_lookup_t lock)
1547{
1548
1549 lock == INPLOOKUP_RLOCKPCB ?
1550 rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
1551}
1552
1553static inline void
1554inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
1555{
1556
1557 lock == INPLOOKUP_RLOCKPCB ?
1558 rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
1559}
1560
1561static inline int
1562inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
1563{
1564
1565 return (lock == INPLOOKUP_RLOCKPCB ?
1566 rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
1567}
1568
1569static inline bool
1570in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
1571{
1572
1573 return (lock == INPLOOKUP_RLOCKPCB ?
1575}
1576
1577bool
1578inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
1579{
1580
1581 MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
1582 SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);
1583
1584 if (__predict_true(inp_trylock(inp, lock))) {
1585 if (__predict_false(inp->inp_flags & INP_FREED)) {
1586 smr_exit(inp->inp_pcbinfo->ipi_smr);
1587 inp_unlock(inp, lock);
1588 return (false);
1589 }
1590 smr_exit(inp->inp_pcbinfo->ipi_smr);
1591 return (true);
1592 }
1593
1594 if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
1595 smr_exit(inp->inp_pcbinfo->ipi_smr);
1596 inp_lock(inp, lock);
1597 if (__predict_false(in_pcbrele(inp, lock)))
1598 return (false);
1599 /*
1600 * inp acquired through refcount & lock for sure didn't went
1601 * through uma_zfree(). However, it may have already went
1602 * through in_pcbfree() and has another reference, that
1603 * prevented its release by our in_pcbrele().
1604 */
1605 if (__predict_false(inp->inp_flags & INP_FREED)) {
1606 inp_unlock(inp, lock);
1607 return (false);
1608 }
1609 return (true);
1610 } else {
1611 smr_exit(inp->inp_pcbinfo->ipi_smr);
1612 return (false);
1613 }
1614}
1615
1616/*
1617 * inp_next() - inpcb hash/list traversal iterator
1618 *
1619 * Requires initialized struct inpcb_iterator for context.
1620 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
1621 *
1622 * - Iterator can have either write-lock or read-lock semantics, that can not
1623 * be changed later.
1624 * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
1625 * a single hash slot. Note: only rip_input() does the latter.
1626 * - Iterator may have optional bool matching function. The matching function
1627 * will be executed for each inpcb in the SMR context, so it can not acquire
1628 * locks and can safely access only immutable fields of inpcb.
1629 *
1630 * A fresh initialized iterator has NULL inpcb in its context and that
1631 * means that inp_next() call would return the very first inpcb on the list
1632 * locked with desired semantic. In all following calls the context pointer
1633 * shall hold the current inpcb pointer. The KPI user is not supposed to
1634 * unlock the current inpcb! Upon end of traversal inp_next() will return NULL
1635 * and write NULL to its context. After end of traversal an iterator can be
1636 * reused.
1637 *
1638 * List traversals have the following features/constraints:
1639 * - New entries won't be seen, as they are always added to the head of a list.
1640 * - Removed entries won't stop traversal as long as they are not added to
1641 * a different list. This is violated by in_pcbrehash().
1642 */
1643#define II_LIST_FIRST(ipi, hash) \
1644 (((hash) == INP_ALL_LIST) ? \
1645 CK_LIST_FIRST(&(ipi)->ipi_listhead) : \
1646 CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)]))
1647#define II_LIST_NEXT(inp, hash) \
1648 (((hash) == INP_ALL_LIST) ? \
1649 CK_LIST_NEXT((inp), inp_list) : \
1650 CK_LIST_NEXT((inp), inp_hash))
1651#define II_LOCK_ASSERT(inp, lock) \
1652 rw_assert(&(inp)->inp_lock, \
1653 (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED )
1654struct inpcb *
1656{
1657 const struct inpcbinfo *ipi = ii->ipi;
1658 inp_match_t *match = ii->match;
1659 void *ctx = ii->ctx;
1660 inp_lookup_t lock = ii->lock;
1661 int hash = ii->hash;
1662 struct inpcb *inp;
1663
1664 if (ii->inp == NULL) { /* First call. */
1665 smr_enter(ipi->ipi_smr);
1666 /* This is unrolled CK_LIST_FOREACH(). */
1667 for (inp = II_LIST_FIRST(ipi, hash);
1668 inp != NULL;
1669 inp = II_LIST_NEXT(inp, hash)) {
1670 if (match != NULL && (match)(inp, ctx) == false)
1671 continue;
1672 if (__predict_true(inp_smr_lock(inp, lock)))
1673 break;
1674 else {
1675 smr_enter(ipi->ipi_smr);
1676 MPASS(inp != II_LIST_FIRST(ipi, hash));
1677 inp = II_LIST_FIRST(ipi, hash);
1678 if (inp == NULL)
1679 break;
1680 }
1681 }
1682
1683 if (inp == NULL)
1684 smr_exit(ipi->ipi_smr);
1685 else
1686 ii->inp = inp;
1687
1688 return (inp);
1689 }
1690
1691 /* Not a first call. */
1692 smr_enter(ipi->ipi_smr);
1693restart:
1694 inp = ii->inp;
1695 II_LOCK_ASSERT(inp, lock);
1696next:
1697 inp = II_LIST_NEXT(inp, hash);
1698 if (inp == NULL) {
1699 smr_exit(ipi->ipi_smr);
1700 goto found;
1701 }
1702
1703 if (match != NULL && (match)(inp, ctx) == false)
1704 goto next;
1705
1706 if (__predict_true(inp_trylock(inp, lock))) {
1707 if (__predict_false(inp->inp_flags & INP_FREED)) {
1708 /*
1709 * Entries are never inserted in middle of a list, thus
1710 * as long as we are in SMR, we can continue traversal.
1711 * Jump to 'restart' should yield in the same result,
1712 * but could produce unnecessary looping. Could this
1713 * looping be unbound?
1714 */
1715 inp_unlock(inp, lock);
1716 goto next;
1717 } else {
1718 smr_exit(ipi->ipi_smr);
1719 goto found;
1720 }
1721 }
1722
1723 /*
1724 * Can't obtain lock immediately, thus going hard. Once we exit the
1725 * SMR section we can no longer jump to 'next', and our only stable
1726 * anchoring point is ii->inp, which we keep locked for this case, so
1727 * we jump to 'restart'.
1728 */
1729 if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
1730 smr_exit(ipi->ipi_smr);
1731 inp_lock(inp, lock);
1732 if (__predict_false(in_pcbrele(inp, lock))) {
1733 smr_enter(ipi->ipi_smr);
1734 goto restart;
1735 }
1736 /*
1737 * See comment in inp_smr_lock().
1738 */
1739 if (__predict_false(inp->inp_flags & INP_FREED)) {
1740 inp_unlock(inp, lock);
1741 smr_enter(ipi->ipi_smr);
1742 goto restart;
1743 }
1744 } else
1745 goto next;
1746
1747found:
1748 inp_unlock(ii->inp, lock);
1749 ii->inp = inp;
1750
1751 return (ii->inp);
1752}
1753
1754/*
1755 * in_pcbref() bumps the reference count on an inpcb in order to maintain
1756 * stability of an inpcb pointer despite the inpcb lock being released or
1757 * SMR section exited.
1758 *
1759 * To free a reference later in_pcbrele_(r|w)locked() must be performed.
1760 */
1761void
1762in_pcbref(struct inpcb *inp)
1763{
1764 u_int old __diagused;
1765
1766 old = refcount_acquire(&inp->inp_refcount);
1767 KASSERT(old > 0, ("%s: refcount 0", __func__));
1768}
1769
1770/*
1771 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
1772 * freeing the pcb, if the reference was very last.
1773 */
1774bool
1776{
1777
1778 INP_RLOCK_ASSERT(inp);
1779
1780 if (refcount_release(&inp->inp_refcount) == 0)
1781 return (false);
1782
1783 MPASS(inp->inp_flags & INP_FREED);
1784 MPASS(inp->inp_socket == NULL);
1785 MPASS(inp->inp_in_hpts == 0);
1786 INP_RUNLOCK(inp);
1787 uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
1788 return (true);
1789}
1790
1791bool
1793{
1794
1795 INP_WLOCK_ASSERT(inp);
1796
1797 if (refcount_release(&inp->inp_refcount) == 0)
1798 return (false);
1799
1800 MPASS(inp->inp_flags & INP_FREED);
1801 MPASS(inp->inp_socket == NULL);
1802 MPASS(inp->inp_in_hpts == 0);
1803 INP_WUNLOCK(inp);
1804 uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
1805 return (true);
1806}
1807
1808/*
1809 * Unconditionally schedule an inpcb to be freed by decrementing its
1810 * reference count, which should occur only after the inpcb has been detached
1811 * from its socket. If another thread holds a temporary reference (acquired
1812 * using in_pcbref()) then the free is deferred until that reference is
1813 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
1814 * Almost all work, including removal from global lists, is done in this
1815 * context, where the pcbinfo lock is held.
1816 */
1817void
1818in_pcbfree(struct inpcb *inp)
1819{
1820 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1821#ifdef INET
1822 struct ip_moptions *imo;
1823#endif
1824#ifdef INET6
1825 struct ip6_moptions *im6o;
1826#endif
1827
1828 INP_WLOCK_ASSERT(inp);
1829 KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1830 KASSERT((inp->inp_flags & INP_FREED) == 0,
1831 ("%s: called twice for pcb %p", __func__, inp));
1832
1833 inp->inp_flags |= INP_FREED;
1834 INP_INFO_WLOCK(pcbinfo);
1835 inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
1836 pcbinfo->ipi_count--;
1837 CK_LIST_REMOVE(inp, inp_list);
1838 INP_INFO_WUNLOCK(pcbinfo);
1839
1840 if (inp->inp_flags & INP_INHASHLIST) {
1841 struct inpcbport *phd = inp->inp_phd;
1842
1843 INP_HASH_WLOCK(pcbinfo);
1844 /* XXX: Only do if SO_REUSEPORT_LB set? */
1846
1847 CK_LIST_REMOVE(inp, inp_hash);
1848 CK_LIST_REMOVE(inp, inp_portlist);
1849 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
1850 CK_LIST_REMOVE(phd, phd_hash);
1851 uma_zfree_smr(pcbinfo->ipi_portzone, phd);
1852 }
1853 INP_HASH_WUNLOCK(pcbinfo);
1854 inp->inp_flags &= ~INP_INHASHLIST;
1855 }
1856
1857 RO_INVALIDATE_CACHE(&inp->inp_route);
1858#ifdef MAC
1859 mac_inpcb_destroy(inp);
1860#endif
1861#if defined(IPSEC) || defined(IPSEC_SUPPORT)
1862 if (inp->inp_sp != NULL)
1863 ipsec_delete_pcbpolicy(inp);
1864#endif
1865#ifdef INET
1866 if (inp->inp_options)
1867 (void)m_free(inp->inp_options);
1868 imo = inp->inp_moptions;
1869#endif
1870#ifdef INET6
1871 if (inp->inp_vflag & INP_IPV6PROTO) {
1872 ip6_freepcbopts(inp->in6p_outputopts);
1873 im6o = inp->in6p_moptions;
1874 } else
1875 im6o = NULL;
1876#endif
1877
1878 if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
1879 INP_WUNLOCK(inp);
1880 }
1881#ifdef INET6
1882 ip6_freemoptions(im6o);
1883#endif
1884#ifdef INET
1885 inp_freemoptions(imo);
1886#endif
1887 /* Destruction is finalized in inpcb_dtor(). */
1888}
1889
1890static void
1891inpcb_dtor(void *mem, int size, void *arg)
1892{
1893 struct inpcb *inp = mem;
1894
1895 crfree(inp->inp_cred);
1896#ifdef INVARIANTS
1897 inp->inp_cred = NULL;
1898#endif
1899}
1900
/*
 * Protocols initialize their inpcbs in different ways, in particular
 * they give the lock different names.  Disposal, however, is the same
 * for all of them: the inpcb lock is destroyed.
 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *pcb = (struct inpcb *)mem;

	INP_LOCK_DESTROY(pcb);
}
1912
1913/*
1914 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
1915 * port reservation, and preventing it from being returned by inpcb lookups.
1916 *
1917 * It is used by TCP to mark an inpcb as unused and avoid future packet
1918 * delivery or event notification when a socket remains open but TCP has
1919 * closed. This might occur as a result of a shutdown()-initiated TCP close
1920 * or a RST on the wire, and allows the port binding to be reused while still
1921 * maintaining the invariant that so_pcb always points to a valid inpcb until
1922 * in_pcbdetach().
1923 *
1924 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
1925 * in_pcbnotifyall() and in_pcbpurgeif0()?
1926 */
1927void
1928in_pcbdrop(struct inpcb *inp)
1929{
1930
1931 INP_WLOCK_ASSERT(inp);
1932#ifdef INVARIANTS
1933 if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
1934 MPASS(inp->inp_refcount > 1);
1935#endif
1936
1937 /*
1938 * XXXRW: Possibly we should protect the setting of INP_DROPPED with
1939 * the hash lock...?
1940 */
1941 inp->inp_flags |= INP_DROPPED;
1942 if (inp->inp_flags & INP_INHASHLIST) {
1943 struct inpcbport *phd = inp->inp_phd;
1944
1947 CK_LIST_REMOVE(inp, inp_hash);
1948 CK_LIST_REMOVE(inp, inp_portlist);
1949 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
1950 CK_LIST_REMOVE(phd, phd_hash);
1951 uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
1952 }
1954 inp->inp_flags &= ~INP_INHASHLIST;
1955 }
1956}
1957
1958#ifdef INET
1959/*
1960 * Common routines to return the socket addresses associated with inpcbs.
1961 */
1962struct sockaddr *
1963in_sockaddr(in_port_t port, struct in_addr *addr_p)
1964{
1965 struct sockaddr_in *sin;
1966
1967 sin = malloc(sizeof *sin, M_SONAME,
1968 M_WAITOK | M_ZERO);
1969 sin->sin_family = AF_INET;
1970 sin->sin_len = sizeof(*sin);
1971 sin->sin_addr = *addr_p;
1972 sin->sin_port = port;
1973
1974 return (struct sockaddr *)sin;
1975}
1976
1977int
1978in_getsockaddr(struct socket *so, struct sockaddr **nam)
1979{
1980 struct inpcb *inp;
1981 struct in_addr addr;
1982 in_port_t port;
1983
1984 inp = sotoinpcb(so);
1985 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1986
1987 INP_RLOCK(inp);
1988 port = inp->inp_lport;
1989 addr = inp->inp_laddr;
1990 INP_RUNLOCK(inp);
1991
1992 *nam = in_sockaddr(port, &addr);
1993 return 0;
1994}
1995
1996int
1997in_getpeeraddr(struct socket *so, struct sockaddr **nam)
1998{
1999 struct inpcb *inp;
2000 struct in_addr addr;
2001 in_port_t port;
2002
2003 inp = sotoinpcb(so);
2004 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
2005
2006 INP_RLOCK(inp);
2007 port = inp->inp_fport;
2008 addr = inp->inp_faddr;
2009 INP_RUNLOCK(inp);
2010
2011 *nam = in_sockaddr(port, &addr);
2012 return 0;
2013}
2014
2015void
2016in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
2017 struct inpcb *(*notify)(struct inpcb *, int))
2018{
2019 struct inpcb *inp, *inp_temp;
2020
2021 INP_INFO_WLOCK(pcbinfo);
2022 CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) {
2023 INP_WLOCK(inp);
2024#ifdef INET6
2025 if ((inp->inp_vflag & INP_IPV4) == 0) {
2026 INP_WUNLOCK(inp);
2027 continue;
2028 }
2029#endif
2030 if (inp->inp_faddr.s_addr != faddr.s_addr ||
2031 inp->inp_socket == NULL) {
2032 INP_WUNLOCK(inp);
2033 continue;
2034 }
2035 if ((*notify)(inp, errno))
2036 INP_WUNLOCK(inp);
2037 }
2038 INP_INFO_WUNLOCK(pcbinfo);
2039}
2040
2041static bool
2042inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
2043{
2044
2045 if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
2046 return (true);
2047 else
2048 return (false);
2049}
2050
2051void
2052in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
2053{
2054 struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
2055 inp_v4_multi_match, NULL);
2056 struct inpcb *inp;
2057 struct in_multi *inm;
2058 struct in_mfilter *imf;
2059 struct ip_moptions *imo;
2060
2062
2063 while ((inp = inp_next(&inpi)) != NULL) {
2064 INP_WLOCK_ASSERT(inp);
2065
2066 imo = inp->inp_moptions;
2067 /*
2068 * Unselect the outgoing interface if it is being
2069 * detached.
2070 */
2071 if (imo->imo_multicast_ifp == ifp)
2072 imo->imo_multicast_ifp = NULL;
2073
2074 /*
2075 * Drop multicast group membership if we joined
2076 * through the interface being detached.
2077 *
2078 * XXX This can all be deferred to an epoch_call
2079 */
2080restart:
2081 IP_MFILTER_FOREACH(imf, &imo->imo_head) {
2082 if ((inm = imf->imf_inm) == NULL)
2083 continue;
2084 if (inm->inm_ifp != ifp)
2085 continue;
2086 ip_mfilter_remove(&imo->imo_head, imf);
2087 in_leavegroup_locked(inm, NULL);
2088 ip_mfilter_free(imf);
2089 goto restart;
2090 }
2091 }
2092}
2093
2094/*
2095 * Lookup a PCB based on the local address and port. Caller must hold the
2096 * hash lock. No inpcb locks or references are acquired.
2097 */
2098#define INP_LOOKUP_MAPPED_PCB_COST 3
2099struct inpcb *
2100in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
2101 u_short lport, int lookupflags, struct ucred *cred)
2102{
2103 struct inpcb *inp;
2104#ifdef INET6
2105 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
2106#else
2107 int matchwild = 3;
2108#endif
2109 int wildcard;
2110
2111 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
2112 ("%s: invalid lookup flags %d", __func__, lookupflags));
2113 INP_HASH_LOCK_ASSERT(pcbinfo);
2114
2115 if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
2116 struct inpcbhead *head;
2117 /*
2118 * Look for an unconnected (wildcard foreign addr) PCB that
2119 * matches the local address and port we're looking for.
2120 */
2121 head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
2122 pcbinfo->ipi_hashmask)];
2123 CK_LIST_FOREACH(inp, head, inp_hash) {
2124#ifdef INET6
2125 /* XXX inp locking */
2126 if ((inp->inp_vflag & INP_IPV4) == 0)
2127 continue;
2128#endif
2129 if (inp->inp_faddr.s_addr == INADDR_ANY &&
2130 inp->inp_laddr.s_addr == laddr.s_addr &&
2131 inp->inp_lport == lport) {
2132 /*
2133 * Found?
2134 */
2135 if (cred == NULL ||
2136 prison_equal_ip4(cred->cr_prison,
2137 inp->inp_cred->cr_prison))
2138 return (inp);
2139 }
2140 }
2141 /*
2142 * Not found.
2143 */
2144 return (NULL);
2145 } else {
2146 struct inpcbporthead *porthash;
2147 struct inpcbport *phd;
2148 struct inpcb *match = NULL;
2149 /*
2150 * Best fit PCB lookup.
2151 *
2152 * First see if this local port is in use by looking on the
2153 * port hash list.
2154 */
2155 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
2156 pcbinfo->ipi_porthashmask)];
2157 CK_LIST_FOREACH(phd, porthash, phd_hash) {
2158 if (phd->phd_port == lport)
2159 break;
2160 }
2161 if (phd != NULL) {
2162 /*
2163 * Port is in use by one or more PCBs. Look for best
2164 * fit.
2165 */
2166 CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
2167 wildcard = 0;
2168 if (cred != NULL &&
2169 !prison_equal_ip4(inp->inp_cred->cr_prison,
2170 cred->cr_prison))
2171 continue;
2172#ifdef INET6
2173 /* XXX inp locking */
2174 if ((inp->inp_vflag & INP_IPV4) == 0)
2175 continue;
2176 /*
2177 * We never select the PCB that has
2178 * INP_IPV6 flag and is bound to :: if
2179 * we have another PCB which is bound
2180 * to 0.0.0.0. If a PCB has the
2181 * INP_IPV6 flag, then we set its cost
2182 * higher than IPv4 only PCBs.
2183 *
2184 * Note that the case only happens
2185 * when a socket is bound to ::, under
2186 * the condition that the use of the
2187 * mapped address is allowed.
2188 */
2189 if ((inp->inp_vflag & INP_IPV6) != 0)
2190 wildcard += INP_LOOKUP_MAPPED_PCB_COST;
2191#endif
2192 if (inp->inp_faddr.s_addr != INADDR_ANY)
2193 wildcard++;
2194 if (inp->inp_laddr.s_addr != INADDR_ANY) {
2195 if (laddr.s_addr == INADDR_ANY)
2196 wildcard++;
2197 else if (inp->inp_laddr.s_addr != laddr.s_addr)
2198 continue;
2199 } else {
2200 if (laddr.s_addr != INADDR_ANY)
2201 wildcard++;
2202 }
2203 if (wildcard < matchwild) {
2204 match = inp;
2205 matchwild = wildcard;
2206 if (matchwild == 0)
2207 break;
2208 }
2209 }
2210 }
2211 return (match);
2212 }
2213}
2214#undef INP_LOOKUP_MAPPED_PCB_COST
2215
2216static struct inpcb *
2217in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
2218 const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
2219 uint16_t fport, int lookupflags, int numa_domain)
2220{
2221 struct inpcb *local_wild, *numa_wild;
2222 const struct inpcblbgrouphead *hdr;
2223 struct inpcblbgroup *grp;
2224 uint32_t idx;
2225
2226 INP_HASH_LOCK_ASSERT(pcbinfo);
2227
2228 hdr = &pcbinfo->ipi_lbgrouphashbase[
2229 INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
2230
2231 /*
2232 * Order of socket selection:
2233 * 1. non-wild.
2234 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
2235 *
2236 * NOTE:
2237 * - Load balanced group does not contain jailed sockets
2238 * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
2239 */
2240 local_wild = NULL;
2241 numa_wild = NULL;
2242 CK_LIST_FOREACH(grp, hdr, il_list) {
2243#ifdef INET6
2244 if (!(grp->il_vflag & INP_IPV4))
2245 continue;
2246#endif
2247 if (grp->il_lport != lport)
2248 continue;
2249
2250 idx = INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
2251 grp->il_inpcnt;
2252 if (grp->il_laddr.s_addr == laddr->s_addr) {
2253 if (numa_domain == M_NODOM ||
2254 grp->il_numa_domain == numa_domain) {
2255 return (grp->il_inp[idx]);
2256 } else {
2257 numa_wild = grp->il_inp[idx];
2258 }
2259 }
2260 if (grp->il_laddr.s_addr == INADDR_ANY &&
2261 (lookupflags & INPLOOKUP_WILDCARD) != 0 &&
2262 (local_wild == NULL || numa_domain == M_NODOM ||
2263 grp->il_numa_domain == numa_domain)) {
2264 local_wild = grp->il_inp[idx];
2265 }
2266 }
2267 if (numa_wild != NULL)
2268 return (numa_wild);
2269
2270 return (local_wild);
2271}
2272
2273/*
2274 * Lookup PCB in hash list, using pcbinfo tables. This variation assumes
2275 * that the caller has either locked the hash list, which usually happens
2276 * for bind(2) operations, or is in SMR section, which happens when sorting
2277 * out incoming packets.
2278 */
2279static struct inpcb *
2280in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2281 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
2282 struct ifnet *ifp, uint8_t numa_domain)
2283{
2284 struct inpcbhead *head;
2285 struct inpcb *inp, *tmpinp;
2286 u_short fport = fport_arg, lport = lport_arg;
2287
2288 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
2289 ("%s: invalid lookup flags %d", __func__, lookupflags));
2290 INP_HASH_LOCK_ASSERT(pcbinfo);
2291
2292 /*
2293 * First look for an exact match.
2294 */
2295 tmpinp = NULL;
2296 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&faddr, lport, fport,
2297 pcbinfo->ipi_hashmask)];
2298 CK_LIST_FOREACH(inp, head, inp_hash) {
2299#ifdef INET6
2300 /* XXX inp locking */
2301 if ((inp->inp_vflag & INP_IPV4) == 0)
2302 continue;
2303#endif
2304 if (inp->inp_faddr.s_addr == faddr.s_addr &&
2305 inp->inp_laddr.s_addr == laddr.s_addr &&
2306 inp->inp_fport == fport &&
2307 inp->inp_lport == lport) {
2308 /*
2309 * XXX We should be able to directly return
2310 * the inp here, without any checks.
2311 * Well unless both bound with SO_REUSEPORT?
2312 */
2313 if (prison_flag(inp->inp_cred, PR_IP4))
2314 return (inp);
2315 if (tmpinp == NULL)
2316 tmpinp = inp;
2317 }
2318 }
2319 if (tmpinp != NULL)
2320 return (tmpinp);
2321
2322 /*
2323 * Then look in lb group (for wildcard match).
2324 */
2325 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2326 inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
2327 fport, lookupflags, numa_domain);
2328 if (inp != NULL)
2329 return (inp);
2330 }
2331
2332 /*
2333 * Then look for a wildcard match, if requested.
2334 */
2335 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2336 struct inpcb *local_wild = NULL, *local_exact = NULL;
2337#ifdef INET6
2338 struct inpcb *local_wild_mapped = NULL;
2339#endif
2340 struct inpcb *jail_wild = NULL;
2341 int injail;
2342
2343 /*
2344 * Order of socket selection - we always prefer jails.
2345 * 1. jailed, non-wild.
2346 * 2. jailed, wild.
2347 * 3. non-jailed, non-wild.
2348 * 4. non-jailed, wild.
2349 */
2350
2351 head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
2352 pcbinfo->ipi_hashmask)];
2353 CK_LIST_FOREACH(inp, head, inp_hash) {
2354#ifdef INET6
2355 /* XXX inp locking */
2356 if ((inp->inp_vflag & INP_IPV4) == 0)
2357 continue;
2358#endif
2359 if (inp->inp_faddr.s_addr != INADDR_ANY ||
2360 inp->inp_lport != lport)
2361 continue;
2362
2363 injail = prison_flag(inp->inp_cred, PR_IP4);
2364 if (injail) {
2366 inp->inp_cred->cr_prison, &laddr) != 0)
2367 continue;
2368 } else {
2369 if (local_exact != NULL)
2370 continue;
2371 }
2372
2373 if (inp->inp_laddr.s_addr == laddr.s_addr) {
2374 if (injail)
2375 return (inp);
2376 else
2377 local_exact = inp;
2378 } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
2379#ifdef INET6
2380 /* XXX inp locking, NULL check */
2381 if (inp->inp_vflag & INP_IPV6PROTO)
2382 local_wild_mapped = inp;
2383 else
2384#endif
2385 if (injail)
2386 jail_wild = inp;
2387 else
2388 local_wild = inp;
2389 }
2390 } /* LIST_FOREACH */
2391 if (jail_wild != NULL)
2392 return (jail_wild);
2393 if (local_exact != NULL)
2394 return (local_exact);
2395 if (local_wild != NULL)
2396 return (local_wild);
2397#ifdef INET6
2398 if (local_wild_mapped != NULL)
2399 return (local_wild_mapped);
2400#endif
2401 } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
2402
2403 return (NULL);
2404}
2405
2406/*
2407 * Lookup PCB in hash list, using pcbinfo tables. This variation locks the
2408 * hash list lock, and will return the inpcb locked (i.e., requires
2409 * INPLOOKUP_LOCKPCB).
2410 */
2411static struct inpcb *
2412in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2413 u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2414 struct ifnet *ifp, uint8_t numa_domain)
2415{
2416 struct inpcb *inp;
2417
2418 smr_enter(pcbinfo->ipi_smr);
2419 inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
2420 lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
2421 if (inp != NULL) {
2422 if (__predict_false(inp_smr_lock(inp,
2423 (lookupflags & INPLOOKUP_LOCKMASK)) == false))
2424 inp = NULL;
2425 } else
2426 smr_exit(pcbinfo->ipi_smr);
2427
2428 return (inp);
2429}
2430
2431/*
2432 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2433 * from which a pre-calculated hash value may be extracted.
2434 */
2435struct inpcb *
2436in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2437 struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
2438{
2439
2440 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2441 ("%s: invalid lookup flags %d", __func__, lookupflags));
2442 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2443 ("%s: LOCKPCB not set", __func__));
2444
2445 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2446 lookupflags, ifp, M_NODOM));
2447}
2448
2449struct inpcb *
2450in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2451 u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2452 struct ifnet *ifp, struct mbuf *m)
2453{
2454
2455 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2456 ("%s: invalid lookup flags %d", __func__, lookupflags));
2457 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2458 ("%s: LOCKPCB not set", __func__));
2459
2460 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2461 lookupflags, ifp, m->m_pkthdr.numa_domain));
2462}
2463#endif /* INET */
2464
2465/*
2466 * Insert PCB onto various hash lists.
2467 */
2468int
2470{
2471 struct inpcbhead *pcbhash;
2472 struct inpcbporthead *pcbporthash;
2473 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2474 struct inpcbport *phd;
2475 int so_options;
2476
2477 INP_WLOCK_ASSERT(inp);
2478 INP_HASH_WLOCK_ASSERT(pcbinfo);
2479
2480 KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
2481 ("in_pcbinshash: INP_INHASHLIST"));
2482
2483#ifdef INET6
2484 if (inp->inp_vflag & INP_IPV6)
2485 pcbhash = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
2486 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2487 else
2488#endif
2489 pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
2490 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2491
2492 pcbporthash = &pcbinfo->ipi_porthashbase[
2493 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
2494
2495 /*
2496 * Add entry to load balance group.
2497 * Only do this if SO_REUSEPORT_LB is set.
2498 */
2499 so_options = inp_so_options(inp);
2500 if (so_options & SO_REUSEPORT_LB) {
2501 int ret = in_pcbinslbgrouphash(inp, M_NODOM);
2502 if (ret) {
2503 /* pcb lb group malloc fail (ret=ENOBUFS). */
2504 return (ret);
2505 }
2506 }
2507
2508 /*
2509 * Go through port list and look for a head for this lport.
2510 */
2511 CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
2512 if (phd->phd_port == inp->inp_lport)
2513 break;
2514 }
2515 /*
2516 * If none exists, malloc one and tack it on.
2517 */
2518 if (phd == NULL) {
2519 phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT);
2520 if (phd == NULL) {
2521 return (ENOBUFS); /* XXX */
2522 }
2523 phd->phd_port = inp->inp_lport;
2524 CK_LIST_INIT(&phd->phd_pcblist);
2525 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
2526 }
2527 inp->inp_phd = phd;
2528 CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
2529 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
2530 inp->inp_flags |= INP_INHASHLIST;
2531
2532 return (0);
2533}
2534
2535/*
2536 * Move PCB to the proper hash bucket when { faddr, fport } have been
2537 * changed. NOTE: This does not handle the case of the lport changing (the
2538 * hashed port list would have to be updated as well), so the lport must
2539 * not change after in_pcbinshash() has been called.
2540 *
2541 * XXXGL: a race between this function and SMR-protected hash iterator
2542 * will lead to iterator traversing a possibly wrong hash list. However,
2543 * this race should have been here since change from rwlock to epoch.
2544 */
2545void
2547{
2548 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2549 struct inpcbhead *head;
2550
2551 INP_WLOCK_ASSERT(inp);
2552 INP_HASH_WLOCK_ASSERT(pcbinfo);
2553
2554 KASSERT(inp->inp_flags & INP_INHASHLIST,
2555 ("in_pcbrehash: !INP_INHASHLIST"));
2556
2557#ifdef INET6
2558 if (inp->inp_vflag & INP_IPV6)
2559 head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
2560 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2561 else
2562#endif
2563 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
2564 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2565
2566 CK_LIST_REMOVE(inp, inp_hash);
2567 CK_LIST_INSERT_HEAD(head, inp, inp_hash);
2568}
2569
2570/*
2571 * Check for alternatives when higher level complains
2572 * about service problems. For now, invalidate cached
2573 * routing information. If the route was created dynamically
2574 * (by a redirect), time to try a default gateway again.
2575 */
2576void
2577in_losing(struct inpcb *inp)
2578{
2579
2580 RO_INVALIDATE_CACHE(&inp->inp_route);
2581 return;
2582}
2583
/*
 * A set label operation has occurred at the socket layer; propagate the
 * label change into the in_pcb for the socket.  Without MAC this function
 * does nothing.  With MAC, mac_inpcb_sosetlabel() is called with the inpcb
 * write lock taken first and the socket lock taken second.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *pcb = sotoinpcb(so);

	KASSERT(pcb != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	INP_WLOCK(pcb);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, pcb);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(pcb);
#endif
}
2604
2605/*
2606 * ipport_tick runs once per second, determining if random port allocation
2607 * should be continued. If more than ipport_randomcps ports have been
2608 * allocated in the last second, then we return to sequential port
2609 * allocation. We return to random allocation only once we drop below
2610 * ipport_randomcps for at least ipport_randomtime seconds.
2611 */
/*
 * Callout handler: walks every vnet under the vnet list read lock and
 * then re-arms itself via callout_reset() with a delay of hz ticks.
 *
 * NOTE(review): this listing has lost lines from the loop body (listing
 * lines 2620-2621, 2623 and 2625-2626): the outer condition, the statement
 * under the V_ipport_stoprandom test, and the else branch are not visible,
 * so the block as shown is not syntactically complete.  Restore them from
 * the original source before building.
 */
2612static void
2613ipport_tick(void *xtp)
2614{
2615 VNET_ITERATOR_DECL(vnet_iter);
2616
2617 VNET_LIST_RLOCK_NOSLEEP();
2618 VNET_FOREACH(vnet_iter) {
2619 CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */
2622 if (V_ipport_stoprandom > 0)
2624 } else
2627 CURVNET_RESTORE();
2628 }
2629 VNET_LIST_RUNLOCK_NOSLEEP();
 /* Re-arm: run again after hz ticks. */
2630 callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
2631}
2632
2633static void
2634ip_fini(void *xtp)
2635{
2636
2637 callout_stop(&ipport_tick_callout);
2638}
2639
2640/*
2641 * The ipport_callout should start running at about the time we attach the
2642 * inet or inet6 domains.
2643 */
2644static void
2645ipport_tick_init(const void *unused __unused)
2646{
2647
2648 /* Start ipport_tick. */
2649 callout_init(&ipport_tick_callout, 1);
2650 callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
2651 EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
2652 SHUTDOWN_PRI_DEFAULT);
2653}
2654SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
2655 ipport_tick_init, NULL);
2656
/* Function wrapper around the INP_WLOCK() macro. */
void
inp_wlock(struct inpcb *inp)
{
	INP_WLOCK(inp);
}
2663
/* Function wrapper around the INP_WUNLOCK() macro. */
void
inp_wunlock(struct inpcb *inp)
{
	INP_WUNLOCK(inp);
}
2670
/* Function wrapper around the INP_RLOCK() macro. */
void
inp_rlock(struct inpcb *inp)
{
	INP_RLOCK(inp);
}
2677
/* Function wrapper around the INP_RUNLOCK() macro. */
void
inp_runlock(struct inpcb *inp)
{
	INP_RUNLOCK(inp);
}
2684
2685#ifdef INVARIANT_SUPPORT
/* Function wrapper around INP_WLOCK_ASSERT(); built with INVARIANT_SUPPORT. */
void
inp_lock_assert(struct inpcb *inp)
{
	INP_WLOCK_ASSERT(inp);
}
2692
/* Function wrapper around INP_UNLOCK_ASSERT(); built with INVARIANT_SUPPORT. */
void
inp_unlock_assert(struct inpcb *inp)
{
	INP_UNLOCK_ASSERT(inp);
}
2699#endif
2700
/*
 * Invoke func(inp, arg) for every inpcb yielded by inp_next() on the
 * iterator `inpi`.
 *
 * NOTE(review): the declaration/initialisation of `inpi` (listing lines
 * 2704-2705) is absent from this listing, so which pcbinfo is walked and
 * with which lock type cannot be determined here -- restore it from the
 * original source.
 */
2701void
2702inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
2703{
2706 struct inpcb *inp;
2707
2708 while ((inp = inp_next(&inpi)) != NULL)
2709 func(inp, arg);
2710}
2711
/*
 * Return inp->inp_socket after INP_WLOCK_ASSERT(inp).
 *
 * NOTE(review): the declarator line (listing line 2713) is absent from
 * this listing, so the function name and parameter list are not visible
 * here -- restore it from the original source.
 */
2712struct socket *
2714{
2715
2716 INP_WLOCK_ASSERT(inp);
2717 return (inp->inp_socket);
2718}
2719
/*
 * Return inp->inp_ppcb cast to struct tcpcb *, after INP_WLOCK_ASSERT(inp).
 *
 * NOTE(review): the declarator line (listing line 2721) is absent from
 * this listing, so the function name and parameter list are not visible
 * here -- restore it from the original source.
 */
2720struct tcpcb *
2722{
2723
2724 INP_WLOCK_ASSERT(inp);
2725 return ((struct tcpcb *)inp->inp_ppcb);
2726}
2727
2728int
2729inp_ip_tos_get(const struct inpcb *inp)
2730{
2731
2732 return (inp->inp_ip_tos);
2733}
2734
2735void
2736inp_ip_tos_set(struct inpcb *inp, int val)
2737{
2738
2739 inp->inp_ip_tos = val;
2740}
2741
2742void
2743inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
2744 uint32_t *faddr, uint16_t *fp)
2745{
2746
2747 INP_LOCK_ASSERT(inp);
2748 *laddr = inp->inp_laddr.s_addr;
2749 *faddr = inp->inp_faddr.s_addr;
2750 *lp = inp->inp_lport;
2751 *fp = inp->inp_fport;
2752}
2753
/* Function wrapper around the sotoinpcb() conversion. */
struct inpcb *
so_sotoinpcb(struct socket *so)
{
	return (sotoinpcb(so));
}
2760
/* Function wrapper around the sototcpcb() conversion. */
struct tcpcb *
so_sototcpcb(struct socket *so)
{
	return (sototcpcb(so));
}
2767
2768/*
2769 * Create an external-format (``xinpcb'') structure using the information in
2770 * the kernel-format in_pcb structure pointed to by inp. This is done to
2771 * reduce the spew of irrelevant information over this interface, to isolate
2772 * user code from changes in the kernel structure, and potentially to provide
2773 * information-hiding if we decide that some of this information should be
2774 * hidden from users.
2775 */
2776void
2777in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
2778{
2779
2780 bzero(xi, sizeof(*xi));
2781 xi->xi_len = sizeof(struct xinpcb);
2782 if (inp->inp_socket)
2783 sotoxsocket(inp->inp_socket, &xi->xi_socket);
2784 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
2785 xi->inp_gencnt = inp->inp_gencnt;
2786 xi->inp_ppcb = (uintptr_t)inp->inp_ppcb;
2787 xi->inp_flow = inp->inp_flow;
2788 xi->inp_flowid = inp->inp_flowid;
2789 xi->inp_flowtype = inp->inp_flowtype;
2790 xi->inp_flags = inp->inp_flags;
2791 xi->inp_flags2 = inp->inp_flags2;
2792 xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket;
2793 xi->in6p_cksum = inp->in6p_cksum;
2794 xi->in6p_hops = inp->in6p_hops;
2795 xi->inp_ip_tos = inp->inp_ip_tos;
2796 xi->inp_vflag = inp->inp_vflag;
2797 xi->inp_ip_ttl = inp->inp_ip_ttl;
2798 xi->inp_ip_p = inp->inp_ip_p;
2799 xi->inp_ip_minttl = inp->inp_ip_minttl;
2800}
2801
2802int
2803sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
2804 int (*ctloutput_set)(struct inpcb *, struct sockopt *))
2805{
2806 struct sockopt sopt;
2807 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
2809 struct inpcb *inp;
2810 struct sockopt_parameters *params;
2811 struct socket *so;
2812 int error;
2813 char buf[1024];
2814
2815 if (req->oldptr != NULL || req->oldlen != 0)
2816 return (EINVAL);
2817 if (req->newptr == NULL)
2818 return (EPERM);
2819 if (req->newlen > sizeof(buf))
2820 return (ENOMEM);
2821 error = SYSCTL_IN(req, buf, req->newlen);
2822 if (error != 0)
2823 return (error);
2824 if (req->newlen < sizeof(struct sockopt_parameters))
2825 return (EINVAL);
2826 params = (struct sockopt_parameters *)buf;
2827 sopt.sopt_level = params->sop_level;
2828 sopt.sopt_name = params->sop_optname;
2829 sopt.sopt_dir = SOPT_SET;
2830 sopt.sopt_val = params->sop_optval;
2831 sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters);
2832 sopt.sopt_td = NULL;
2833#ifdef INET6
2834 if (params->sop_inc.inc_flags & INC_ISIPV6) {
2835 if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_laddr))
2836 params->sop_inc.inc6_laddr.s6_addr16[1] =
2837 htons(params->sop_inc.inc6_zoneid & 0xffff);
2838 if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_faddr))
2839 params->sop_inc.inc6_faddr.s6_addr16[1] =
2840 htons(params->sop_inc.inc6_zoneid & 0xffff);
2841 }
2842#endif
2843 if (params->sop_inc.inc_lport != htons(0)) {
2844 if (params->sop_inc.inc_fport == htons(0))
2845 inpi.hash = INP_PCBHASH_WILD(params->sop_inc.inc_lport,
2846 pcbinfo->ipi_hashmask);
2847 else
2848#ifdef INET6
2849 if (params->sop_inc.inc_flags & INC_ISIPV6)
2850 inpi.hash = INP6_PCBHASH(
2851 &params->sop_inc.inc6_faddr,
2852 params->sop_inc.inc_lport,
2853 params->sop_inc.inc_fport,
2854 pcbinfo->ipi_hashmask);
2855 else
2856#endif
2857 inpi.hash = INP_PCBHASH(
2858 &params->sop_inc.inc_faddr,
2859 params->sop_inc.inc_lport,
2860 params->sop_inc.inc_fport,
2861 pcbinfo->ipi_hashmask);
2862 }
2863 while ((inp = inp_next(&inpi)) != NULL)
2864 if (inp->inp_gencnt == params->sop_id) {
2865 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
2866 INP_WUNLOCK(inp);
2867 return (ECONNRESET);
2868 }
2869 so = inp->inp_socket;
2870 KASSERT(so != NULL, ("inp_socket == NULL"));
2871 soref(so);
2872 error = (*ctloutput_set)(inp, &sopt);
2873 sorele(so);
2874 break;
2875 }
2876 if (inp == NULL)
2877 error = ESRCH;
2878 return (error);
2879}
2880
2881#ifdef DDB
/* Emit `indent` single-space strings through db_printf(). */
static void
db_print_indent(int indent)
{
	int remaining = indent;

	while (remaining > 0) {
		db_printf(" ");
		remaining--;
	}
}
2890
2891static void
2892db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
2893{
2894 char faddr_str[48], laddr_str[48];
2895
2896 db_print_indent(indent);
2897 db_printf("%s at %p\n", name, inc);
2898
2899 indent += 2;
2900
2901#ifdef INET6
2902 if (inc->inc_flags & INC_ISIPV6) {
2903 /* IPv6. */
2904 ip6_sprintf(laddr_str, &inc->inc6_laddr);
2905 ip6_sprintf(faddr_str, &inc->inc6_faddr);
2906 } else
2907#endif
2908 {
2909 /* IPv4. */
2910 inet_ntoa_r(inc->inc_laddr, laddr_str);
2911 inet_ntoa_r(inc->inc_faddr, faddr_str);
2912 }
2913 db_print_indent(indent);
2914 db_printf("inc_laddr %s inc_lport %u\n", laddr_str,
2915 ntohs(inc->inc_lport));
2916 db_print_indent(indent);
2917 db_printf("inc_faddr %s inc_fport %u\n", faddr_str,
2918 ntohs(inc->inc_fport));
2919}
2920
2921static void
2922db_print_inpflags(int inp_flags)
2923{
2924 int comma;
2925
2926 comma = 0;
2927 if (inp_flags & INP_RECVOPTS) {
2928 db_printf("%sINP_RECVOPTS", comma ? ", " : "");
2929 comma = 1;
2930 }
2931 if (inp_flags & INP_RECVRETOPTS) {
2932 db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
2933 comma = 1;
2934 }
2935 if (inp_flags & INP_RECVDSTADDR) {
2936 db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
2937 comma = 1;
2938 }
2939 if (inp_flags & INP_ORIGDSTADDR) {
2940 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
2941 comma = 1;
2942 }
2943 if (inp_flags & INP_HDRINCL) {
2944 db_printf("%sINP_HDRINCL", comma ? ", " : "");
2945 comma = 1;
2946 }
2947 if (inp_flags & INP_HIGHPORT) {
2948 db_printf("%sINP_HIGHPORT", comma ? ", " : "");
2949 comma = 1;
2950 }
2951 if (inp_flags & INP_LOWPORT) {
2952 db_printf("%sINP_LOWPORT", comma ? ", " : "");
2953 comma = 1;
2954 }
2955 if (inp_flags & INP_ANONPORT) {
2956 db_printf("%sINP_ANONPORT", comma ? ", " : "");
2957 comma = 1;
2958 }
2959 if (inp_flags & INP_RECVIF) {
2960 db_printf("%sINP_RECVIF", comma ? ", " : "");
2961 comma = 1;
2962 }
2963 if (inp_flags & INP_MTUDISC) {
2964 db_printf("%sINP_MTUDISC", comma ? ", " : "");
2965 comma = 1;
2966 }
2967 if (inp_flags & INP_RECVTTL) {
2968 db_printf("%sINP_RECVTTL", comma ? ", " : "");
2969 comma = 1;
2970 }
2971 if (inp_flags & INP_DONTFRAG) {
2972 db_printf("%sINP_DONTFRAG", comma ? ", " : "");
2973 comma = 1;
2974 }
2975 if (inp_flags & INP_RECVTOS) {
2976 db_printf("%sINP_RECVTOS", comma ? ", " : "");
2977 comma = 1;
2978 }
2979 if (inp_flags & IN6P_IPV6_V6ONLY) {
2980 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
2981 comma = 1;
2982 }
2983 if (inp_flags & IN6P_PKTINFO) {
2984 db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
2985 comma = 1;
2986 }
2987 if (inp_flags & IN6P_HOPLIMIT) {
2988 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
2989 comma = 1;
2990 }
2991 if (inp_flags & IN6P_HOPOPTS) {
2992 db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
2993 comma = 1;
2994 }
2995 if (inp_flags & IN6P_DSTOPTS) {
2996 db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
2997 comma = 1;
2998 }
2999 if (inp_flags & IN6P_RTHDR) {
3000 db_printf("%sIN6P_RTHDR", comma ? ", " : "");
3001 comma = 1;
3002 }
3003 if (inp_flags & IN6P_RTHDRDSTOPTS) {
3004 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
3005 comma = 1;
3006 }
3007 if (inp_flags & IN6P_TCLASS) {
3008 db_printf("%sIN6P_TCLASS", comma ? ", " : "");
3009 comma = 1;
3010 }
3011 if (inp_flags & IN6P_AUTOFLOWLABEL) {
3012 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
3013 comma = 1;
3014 }
3015 if (inp_flags & INP_TIMEWAIT) {
3016 db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
3017 comma = 1;
3018 }
3019 if (inp_flags & INP_ONESBCAST) {
3020 db_printf("%sINP_ONESBCAST", comma ? ", " : "");
3021 comma = 1;
3022 }
3023 if (inp_flags & INP_DROPPED) {
3024 db_printf("%sINP_DROPPED", comma ? ", " : "");
3025 comma = 1;
3026 }
3027 if (inp_flags & INP_SOCKREF) {
3028 db_printf("%sINP_SOCKREF", comma ? ", " : "");
3029 comma = 1;
3030 }
3031 if (inp_flags & IN6P_RFC2292) {
3032 db_printf("%sIN6P_RFC2292", comma ? ", " : "");
3033 comma = 1;
3034 }
3035 if (inp_flags & IN6P_MTU) {
3036 db_printf("IN6P_MTU%s", comma ? ", " : "");
3037 comma = 1;
3038 }
3039}
3040
3041static void
3042db_print_inpvflag(u_char inp_vflag)
3043{
3044 int comma;
3045
3046 comma = 0;
3047 if (inp_vflag & INP_IPV4) {
3048 db_printf("%sINP_IPV4", comma ? ", " : "");
3049 comma = 1;
3050 }
3051 if (inp_vflag & INP_IPV6) {
3052 db_printf("%sINP_IPV6", comma ? ", " : "");
3053 comma = 1;
3054 }
3055 if (inp_vflag & INP_IPV6PROTO) {
3056 db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
3057 comma = 1;
3058 }
3059}
3060
3061static void
3062db_print_inpcb(struct inpcb *inp, const char *name, int indent)
3063{
3064
3065 db_print_indent(indent);
3066 db_printf("%s at %p\n", name, inp);
3067
3068 indent += 2;
3069
3070 db_print_indent(indent);
3071 db_printf("inp_flow: 0x%x\n", inp->inp_flow);
3072
3073 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
3074
3075 db_print_indent(indent);
3076 db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n",
3077 inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
3078
3079 db_print_indent(indent);
3080 db_printf("inp_label: %p inp_flags: 0x%x (",
3081 inp->inp_label, inp->inp_flags);
3082 db_print_inpflags(inp->inp_flags);
3083 db_printf(")\n");
3084
3085 db_print_indent(indent);
3086 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp,
3087 inp->inp_vflag);
3088 db_print_inpvflag(inp->inp_vflag);
3089 db_printf(")\n");
3090
3091 db_print_indent(indent);
3092 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n",
3093 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
3094
3095 db_print_indent(indent);
3096#ifdef INET6
3097 if (inp->inp_vflag & INP_IPV6) {
3098 db_printf("in6p_options: %p in6p_outputopts: %p "
3099 "in6p_moptions: %p\n", inp->in6p_options,
3100 inp->in6p_outputopts, inp->in6p_moptions);
3101 db_printf("in6p_icmp6filt: %p in6p_cksum %d "
3102 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
3103 inp->in6p_hops);
3104 } else
3105#endif
3106 {
3107 db_printf("inp_ip_tos: %d inp_ip_options: %p "
3108 "inp_ip_moptions: %p\n", inp->inp_ip_tos,
3109 inp->inp_options, inp->inp_moptions);
3110 }
3111
3112 db_print_indent(indent);
3113 db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd,
3114 (uintmax_t)inp->inp_gencnt);
3115}
3116
3117DB_SHOW_COMMAND(inpcb, db_show_inpcb)
3118{
3119 struct inpcb *inp;
3120
3121 if (!have_addr) {
3122 db_printf("usage: show inpcb <addr>\n");
3123 return;
3124 }
3125 inp = (struct inpcb *)addr;
3126
3127 db_print_inpcb(inp, "inpcb", 0);
3128}
3129#endif /* DDB */
3130
3131#ifdef RATELIMIT
3132/*
3133 * Modify TX rate limit based on the existing "inp->inp_snd_tag",
3134 * if any.
3135 */
3136int
3137in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3138{
3139 union if_snd_tag_modify_params params = {
3140 .rate_limit.max_rate = max_pacing_rate,
3141 .rate_limit.flags = M_NOWAIT,
3142 };
3143 struct m_snd_tag *mst;
3144 int error;
3145
3146 mst = inp->inp_snd_tag;
3147 if (mst == NULL)
3148 return (EINVAL);
3149
3150 if (mst->sw->snd_tag_modify == NULL) {
3151 error = EOPNOTSUPP;
3152 } else {
3153 error = mst->sw->snd_tag_modify(mst, &params);
3154 }
3155 return (error);
3156}
3157
3158/*
3159 * Query existing TX rate limit based on the existing
3160 * "inp->inp_snd_tag", if any.
3161 */
3162int
3163in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3164{
3165 union if_snd_tag_query_params params = { };
3166 struct m_snd_tag *mst;
3167 int error;
3168
3169 mst = inp->inp_snd_tag;
3170 if (mst == NULL)
3171 return (EINVAL);
3172
3173 if (mst->sw->snd_tag_query == NULL) {
3174 error = EOPNOTSUPP;
3175 } else {
3176 error = mst->sw->snd_tag_query(mst, &params);
3177 if (error == 0 && p_max_pacing_rate != NULL)
3178 *p_max_pacing_rate = params.rate_limit.max_rate;
3179 }
3180 return (error);
3181}
3182
3183/*
3184 * Query existing TX queue level based on the existing
3185 * "inp->inp_snd_tag", if any.
3186 */
3187int
3188in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
3189{
3190 union if_snd_tag_query_params params = { };
3191 struct m_snd_tag *mst;
3192 int error;
3193
3194 mst = inp->inp_snd_tag;
3195 if (mst == NULL)
3196 return (EINVAL);
3197
3198 if (mst->sw->snd_tag_query == NULL)
3199 return (EOPNOTSUPP);
3200
3201 error = mst->sw->snd_tag_query(mst, &params);
3202 if (error == 0 && p_txqueue_level != NULL)
3203 *p_txqueue_level = params.rate_limit.queue_level;
3204 return (error);
3205}
3206
3207/*
3208 * Allocate a new TX rate limit send tag from the network interface
3209 * given by the "ifp" argument and save it in "inp->inp_snd_tag":
3210 */
3211int
3212in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
3213 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
3214
3215{
3216 union if_snd_tag_alloc_params params = {
3217 .rate_limit.hdr.type = (max_pacing_rate == -1U) ?
3218 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
3219 .rate_limit.hdr.flowid = flowid,
3220 .rate_limit.hdr.flowtype = flowtype,
3221 .rate_limit.hdr.numa_domain = inp->inp_numa_domain,
3222 .rate_limit.max_rate = max_pacing_rate,
3223 .rate_limit.flags = M_NOWAIT,
3224 };
3225 int error;
3226
3227 INP_WLOCK_ASSERT(inp);
3228
3229 /*
3230 * If there is already a send tag, or the INP is being torn
3231 * down, allocating a new send tag is not allowed. Else send
3232 * tags may leak.
3233 */
3234 if (*st != NULL || (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0)
3235 return (EINVAL);
3236
3237 error = m_snd_tag_alloc(ifp, &params, st);
3238#ifdef INET
3239 if (error == 0) {
3240 counter_u64_add(rate_limit_set_ok, 1);
3241 counter_u64_add(rate_limit_active, 1);
3242 } else if (error != EOPNOTSUPP)
3243 counter_u64_add(rate_limit_alloc_fail, 1);
3244#endif
3245 return (error);
3246}
3247
/*
 * Release one reference on the send tag mst and, with INET, decrement the
 * rate_limit_active counter.
 */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{
	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}
3257
3258/*
3259 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3260 * if any:
3261 */
3262void
3263in_pcbdetach_txrtlmt(struct inpcb *inp)
3264{
3265 struct m_snd_tag *mst;
3266
3267 INP_WLOCK_ASSERT(inp);
3268
3269 mst = inp->inp_snd_tag;
3270 inp->inp_snd_tag = NULL;
3271
3272 if (mst == NULL)
3273 return;
3274
3275 m_snd_tag_rele(mst);
3276#ifdef INET
3277 counter_u64_add(rate_limit_active, -1);
3278#endif
3279}
3280
3281int
3282in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
3283{
3284 int error;
3285
3286 /*
3287 * If the existing send tag is for the wrong interface due to
3288 * a route change, first drop the existing tag. Set the
3289 * CHANGED flag so that we will keep trying to allocate a new
3290 * tag if we fail to allocate one this time.
3291 */
3292 if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
3293 in_pcbdetach_txrtlmt(inp);
3295 }
3296
3297 /*
3298 * NOTE: When attaching to a network interface a reference is
3299 * made to ensure the network interface doesn't go away until
3300 * all ratelimit connections are gone. The network interface
3301 * pointers compared below represent valid network interfaces,
3302 * except when comparing towards NULL.
3303 */
3304 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
3305 error = 0;
3306 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
3307 if (inp->inp_snd_tag != NULL)
3308 in_pcbdetach_txrtlmt(inp);
3309 error = 0;
3310 } else if (inp->inp_snd_tag == NULL) {
3311 /*
3312 * In order to utilize packet pacing with RSS, we need
3313 * to wait until there is a valid RSS hash before we
3314 * can proceed:
3315 */
3316 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
3317 error = EAGAIN;
3318 } else {
3319 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
3320 mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
3321 }
3322 } else {
3323 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
3324 }
3325 if (error == 0 || error == EOPNOTSUPP)
3326 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
3327
3328 return (error);
3329}
3330
3331/*
3332 * This function should be called when the INP_RATE_LIMIT_CHANGED flag
3333 * is set in the fast path and will attach/detach/modify the TX rate
3334 * limit send tag based on the socket's so_max_pacing_rate value.
3335 */
3336void
3337in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3338{
3339 struct socket *socket;
3340 uint32_t max_pacing_rate;
3341 bool did_upgrade;
3342 int error;
3343
3344 if (inp == NULL)
3345 return;
3346
3347 socket = inp->inp_socket;
3348 if (socket == NULL)
3349 return;
3350
3351 if (!INP_WLOCKED(inp)) {
3352 /*
3353 * NOTE: If the write locking fails, we need to bail
3354 * out and use the non-ratelimited ring for the
3355 * transmit until there is a new chance to get the
3356 * write lock.
3357 */
3358 if (!INP_TRY_UPGRADE(inp))
3359 return;
3360 did_upgrade = 1;
3361 } else {
3362 did_upgrade = 0;
3363 }
3364
3365 /*
3366 * NOTE: The so_max_pacing_rate value is read unlocked,
3367 * because atomic updates are not required since the variable
3368 * is checked at every mbuf we send. It is assumed that the
3369 * variable read itself will be atomic.
3370 */
3371 max_pacing_rate = socket->so_max_pacing_rate;
3372
3373 error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
3374
3375 if (did_upgrade)
3376 INP_DOWNGRADE(inp);
3377}
3378
3379/*
3380 * Track route changes for TX rate limiting.
3381 */
3382void
3383in_pcboutput_eagain(struct inpcb *inp)
3384{
3385 bool did_upgrade;
3386
3387 if (inp == NULL)
3388 return;
3389
3390 if (inp->inp_snd_tag == NULL)
3391 return;
3392
3393 if (!INP_WLOCKED(inp)) {
3394 /*
3395 * NOTE: If the write locking fails, we need to bail
3396 * out and use the non-ratelimited ring for the
3397 * transmit until there is a new chance to get the
3398 * write lock.
3399 */
3400 if (!INP_TRY_UPGRADE(inp))
3401 return;
3402 did_upgrade = 1;
3403 } else {
3404 did_upgrade = 0;
3405 }
3406
3407 /* detach rate limiting */
3408 in_pcbdetach_txrtlmt(inp);
3409
3410 /* make sure new mbuf send tag allocation is made */
3412
3413 if (did_upgrade)
3414 INP_DOWNGRADE(inp);
3415}
3416
3417#ifdef INET
3418static void
3419rl_init(void *st)
3420{
3421 rate_limit_new = counter_u64_alloc(M_WAITOK);
3422 rate_limit_chg = counter_u64_alloc(M_WAITOK);
3423 rate_limit_active = counter_u64_alloc(M_WAITOK);
3424 rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
3425 rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
3426}
3427
3428SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
3429#endif
3430#endif /* RATELIMIT */
SYSCTL_INT(_net_inet_accf_http, OID_AUTO, parsehttpversion, CTLFLAG_RW, &parse_http_version, 1, "Parse http version so that non 1.x requests work")
static SYSCTL_NODE(_net_inet_accf, OID_AUTO, http, CTLFLAG_RW|CTLFLAG_MPSAFE, 0, "HTTP accept filter")
SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLFLAG_VNET|CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_MPSAFE, NULL, 0, cc_default_algo, "A", "Default congestion control algorithm")
__uint32_t uint32_t
Definition: in.h:62
__uint16_t uint16_t
Definition: in.h:57
__uint8_t uint8_t
Definition: in.h:52
char * inet_ntoa_r(struct in_addr ina, char *buf)
#define INADDR_BROADCAST
Definition: in.h:49
#define INADDR_ANY
Definition: in.h:48
#define sintosa(sin)
Definition: in.h:679
uint16_t in_port_t
Definition: in.h:72
uint32_t in_addr_t
Definition: in.h:67
#define ifatoia(ifa)
Definition: in.h:680
#define IPPROTO_IP
Definition: in.h:43
#define satosin(sa)
Definition: in.h:678
struct nhop_object * fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, uint32_t flags, uint32_t flowid)
uint32_t fib4_calc_software_hash(struct in_addr src, struct in_addr dst, unsigned short src_port, unsigned short dst_port, char proto, uint32_t *phashtype)
int prison_local_ip4(struct ucred *cred, struct in_addr *ia)
Definition: in_jail.c:220
int prison_check_ip4(const struct ucred *cred, const struct in_addr *ia)
Definition: in_jail.c:322
int prison_equal_ip4(struct prison *pr1, struct prison *pr2)
Definition: in_jail.c:186
int prison_saddrsel_ip4(struct ucred *cred, struct in_addr *ia)
Definition: in_jail.c:155
int prison_get_ip4(struct ucred *cred, struct in_addr *ia)
Definition: in_jail.c:122
int prison_check_ip4_locked(const struct prison *pr, const struct in_addr *ia)
Definition: in_jail.c:312
int in_leavegroup_locked(struct in_multi *inm, struct in_mfilter *imf)
Definition: in_mcast.c:1311
void inp_freemoptions(struct ip_moptions *imo)
Definition: in_mcast.c:1593
void ip_mfilter_free(struct in_mfilter *imf)
Definition: in_mcast.c:351
static void ipport_tick(void *xtp)
Definition: in_pcb.c:2613
bool inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
Definition: in_pcb.c:1578
struct inpcb * inp_next(struct inpcb_iterator *ii)
Definition: in_pcb.c:1655
static void in_pcblbgroup_free_deferred(epoch_context_t ctx)
Definition: in_pcb.c:291
bool in_pcbrele_rlocked(struct inpcb *inp)
Definition: in_pcb.c:1775
static void in_pcbhashseed_init(void)
Definition: in_pcb.c:253
void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp)
Definition: in_pcb.c:2743
int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
Definition: in_pcb.c:603
void in_pcbrehash(struct inpcb *inp)
Definition: in_pcb.c:2546
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, in_pcbhashseed_init, 0)
struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp)
Definition: in_pcb.c:2721
struct inpcb * so_sotoinpcb(struct socket *so)
Definition: in_pcb.c:2755
static void inpcb_fini(void *, int)
Definition: in_pcb.c:1906
SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipport_tick_init, NULL)
static void inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
Definition: in_pcb.c:1554
static void in_pcbremlbgrouphash(struct inpcb *inp)
Definition: in_pcb.c:435
void inp_ip_tos_set(struct inpcb *inp, int val)
Definition: in_pcb.c:2736
VNET_DEFINE_STATIC(int, ipport_tcplastcount)
static void in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, int i)
Definition: in_pcb.c:336
static int inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
Definition: in_pcb.c:1562
CTASSERT(sizeof(struct inpcbhead)==sizeof(LIST_HEAD(, inpcb)))
static bool in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
Definition: in_pcb.c:1570
static void inp_lock(struct inpcb *inp, const inp_lookup_t lock)
Definition: in_pcb.c:1546
static void ip_fini(void *xtp)
Definition: in_pcb.c:2634
void in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
Definition: in_pcb.c:552
struct socket * inp_inpcbtosocket(struct inpcb *inp)
Definition: in_pcb.c:2713
static struct callout ipport_tick_callout
Definition: in_pcb.c:120
void inp_wunlock(struct inpcb *inp)
Definition: in_pcb.c:2665
VNET_DEFINE(int, ipport_lowfirstauto)
void in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
Definition: in_pcb.c:2777
void inp_apply_all(void(*func)(struct inpcb *, void *), void *arg)
Definition: in_pcb.c:2702
#define II_LIST_FIRST(ipi, hash)
Definition: in_pcb.c:1643
void in_pcbsosetlabel(struct socket *so)
Definition: in_pcb.c:2589
void in_pcbstorage_init(void *arg)
Definition: in_pcb.c:573
static int in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
Definition: in_pcb.c:359
#define V_ipport_tcplastcount
Definition: in_pcb.c:149
#define INPCBLBGROUP_SIZMAX
Definition: in_pcb.c:117
void in_pcbstorage_destroy(void *arg)
Definition: in_pcb.c:590
static void in_pcblbgroup_free(struct inpcblbgroup *grp)
Definition: in_pcb.c:300
#define II_LOCK_ASSERT(inp, lock)
Definition: in_pcb.c:1651
static struct inpcblbgroup * in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag, uint16_t port, const union in_dependaddr *addr, int size, uint8_t numa_domain)
Definition: in_pcb.c:270
static void inpcb_dtor(void *, int, void *)
Definition: in_pcb.c:1891
__FBSDID("$FreeBSD$")
void inp_wlock(struct inpcb *inp)
Definition: in_pcb.c:2658
static struct inpcblbgroup * in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, struct inpcblbgroup *old_grp, int size)
Definition: in_pcb.c:308
struct tcpcb * so_sototcpcb(struct socket *so)
Definition: in_pcb.c:2762
#define II_LIST_NEXT(inp, hash)
Definition: in_pcb.c:1647
void inp_runlock(struct inpcb *inp)
Definition: in_pcb.c:2679
void in_pcbdrop(struct inpcb *inp)
Definition: in_pcb.c:1928
void in_pcbref(struct inpcb *inp)
Definition: in_pcb.c:1762
int in_pcblbgroup_numa(struct inpcb *inp, int arg)
Definition: in_pcb.c:467
int in_pcbinshash(struct inpcb *inp)
Definition: in_pcb.c:2469
int inp_ip_tos_get(const struct inpcb *inp)
Definition: in_pcb.c:2729
void in_pcbdetach(struct inpcb *inp)
Definition: in_pcb.c:1524
static void ipport_tick_init(const void *unused __unused)
Definition: in_pcb.c:2645
void in_losing(struct inpcb *inp)
Definition: in_pcb.c:2577
bool in_pcbrele_wlocked(struct inpcb *inp)
Definition: in_pcb.c:1792
void in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, u_int hash_nelements, u_int porthash_nelements)
Definition: in_pcb.c:524
void in_pcbfree(struct inpcb *inp)
Definition: in_pcb.c:1818
int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
Definition: in_pcb.c:904
#define INPCBLBGROUP_SIZMIN
Definition: in_pcb.c:116
#define INP_FREED
Definition: in_pcb.c:118
void inp_rlock(struct inpcb *inp)
Definition: in_pcb.c:2672
int sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, int(*ctloutput_set)(struct inpcb *, struct sockopt *))
Definition: in_pcb.c:2803
#define INP_RECVRETOPTS
Definition: in_pcb.h:621
#define V_ipport_hilastauto
Definition: in_pcb.h:729
#define INP_TRY_UPGRADE(inp)
Definition: in_pcb.h:524
#define INP_SOCKAF(so)
Definition: in_pcb.h:703
#define INP_DONTFRAG
Definition: in_pcb.h:631
#define INP_PCBHASH_WILD(lport, mask)
Definition: in_pcb.h:600
#define INP_LOCK_ASSERT(inp)
Definition: in_pcb.h:527
#define V_ipport_randomized
Definition: in_pcb.h:730
#define IN6P_IPV6_V6ONLY
Definition: in_pcb.h:635
#define INP_HASH_WLOCK_ASSERT(ipi)
Definition: in_pcb.h:576
#define INP_ORIGDSTADDR
Definition: in_pcb.h:674
#define IN6P_TCLASS
Definition: in_pcb.h:642
#define INP_IPV4
Definition: in_pcb.h:613
#define INC_ISIPV6
Definition: in_pcb.h:124
#define V_ipport_lowlastauto
Definition: in_pcb.h:725
#define IN6P_HOPOPTS
Definition: in_pcb.h:638
#define INP_WLOCK(inp)
Definition: in_pcb.h:518
#define INP_HASH_WUNLOCK(ipi)
Definition: in_pcb.h:573
#define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport)
Definition: in_pcb.h:603
#define IN6P_MTU
Definition: in_pcb.h:651
#define V_ipport_lastauto
Definition: in_pcb.h:727
#define V_ipport_lowfirstauto
Definition: in_pcb.h:724
#define V_ipport_reservedlow
Definition: in_pcb.h:723
#define INP_HIGHPORT
Definition: in_pcb.h:624
void in_pcbdisconnect(struct inpcb *)
#define IN6P_DSTOPTS
Definition: in_pcb.h:639
#define INP_IPV6PROTO
Definition: in_pcb.h:615
struct inpcb * in_pcblookup(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *)
#define INP_RUNLOCK(inp)
Definition: in_pcb.h:521
#define INP_HASH_LOCK_ASSERT(ipi)
Definition: in_pcb.h:574
#define INP_RECVTTL
Definition: in_pcb.h:630
#define INP_DOWNGRADE(inp)
Definition: in_pcb.h:525
#define INP_ALL_ITERATOR(_ipi, _lock)
Definition: in_pcb.h:795
#define INP_PCBPORTHASH(lport, mask)
Definition: in_pcb.h:608
int in_getsockaddr(struct socket *so, struct sockaddr **nam)
#define V_ipport_reservedhigh
Definition: in_pcb.h:722
#define sotoinpcb(so)
Definition: in_pcb.h:701
#define INP_RECVTOS
Definition: in_pcb.h:634
#define IN6P_RTHDR
Definition: in_pcb.h:640
#define V_ipport_randomcps
Definition: in_pcb.h:731
int in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *, struct ucred *)
int in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, in_addr_t *, u_short *, struct inpcb **, struct ucred *)
#define V_ipport_stoprandom
Definition: in_pcb.h:733
#define inp_lock_assert(inp)
Definition: in_pcb.h:546
int in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *)
#define inp_unlock_assert(inp)
Definition: in_pcb.h:547
#define INP_WLOCK_ASSERT(inp)
Definition: in_pcb.h:529
#define INPLOOKUP_MASK
Definition: in_pcb.h:697
#define INP6_PCBHASH(faddr, lport, fport, mask)
Definition: in_pcb.h:597
#define INP_BINDMULTI
Definition: in_pcb.h:669
#define INP_INHASHLIST
Definition: in_pcb.h:633
#define INP_ONESBCAST
Definition: in_pcb.h:645
#define INP_INFO_WUNLOCK(ipi)
Definition: in_pcb.h:565
int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, struct ucred *)
int in_getpeeraddr(struct socket *so, struct sockaddr **nam)
#define INP_TIMEWAIT
Definition: in_pcb.h:644
#define IN6P_PKTINFO
Definition: in_pcb.h:636
#define IN6P_RFC2292
Definition: in_pcb.h:650
#define V_ipport_tcpallocs
Definition: in_pcb.h:734
#define INP_MTUDISC
Definition: in_pcb.h:628
bool inp_match_t(const struct inpcb *, void *)
Definition: in_pcb.h:775
#define INP_REUSEPORT_LB
Definition: in_pcb.h:676
int in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *, bool)
#define INP_WLOCKED(inp)
Definition: in_pcb.h:526
#define INP_RECVDSTADDR
Definition: in_pcb.h:622
#define INP_HASH_WLOCK(ipi)
Definition: in_pcb.h:572
#define V_ipport_randomtime
Definition: in_pcb.h:732
void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *)
#define INP_RATE_LIMIT_CHANGED
Definition: in_pcb.h:673
#define V_ipport_hifirstauto
Definition: in_pcb.h:728
#define INPLOOKUP_LOCKMASK
Definition: in_pcb.h:699
#define INP_HDRINCL
Definition: in_pcb.h:623
#define INP_BINDANY
Definition: in_pcb.h:632
#define V_ipport_firstauto
Definition: in_pcb.h:726
#define inp_zero_size
Definition: in_pcb.h:223
struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *, struct mbuf *)
#define INP_UNLOCK_ASSERT(inp)
Definition: in_pcb.h:530
#define INP_DROPPED
Definition: in_pcb.h:646
#define INP_RECVOPTS
Definition: in_pcb.h:620
#define IN6P_AUTOFLOWLABEL
Definition: in_pcb.h:643
#define INP_WUNLOCK(inp)
Definition: in_pcb.h:522
int inp_so_options(const struct inpcb *inp)
#define INP_ANONPORT
Definition: in_pcb.h:626
#define INP_INFO_WLOCK(ipi)
Definition: in_pcb.h:563
#define INP_CHECK_SOCKAF(so, af)
Definition: in_pcb.h:705
#define IN6P_RTHDRDSTOPTS
Definition: in_pcb.h:641
#define INP_REUSEADDR
Definition: in_pcb.h:668
#define IN6P_HOPLIMIT
Definition: in_pcb.h:637
#define INP_REUSEPORT
Definition: in_pcb.h:666
#define INP_LOWPORT
Definition: in_pcb.h:625
#define INP_RLOCK(inp)
Definition: in_pcb.h:517
inp_lookup_t
Definition: in_pcb.h:691
@ INPLOOKUP_RLOCKPCB
Definition: in_pcb.h:693
@ INPLOOKUP_WLOCKPCB
Definition: in_pcb.h:694
@ INPLOOKUP_WILDCARD
Definition: in_pcb.h:692
#define INP_LOCK_DESTROY(inp)
Definition: in_pcb.h:516
#define INP_RLOCK_ASSERT(inp)
Definition: in_pcb.h:528
#define INP_IPV6
Definition: in_pcb.h:614
struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr)
void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, int, struct inpcb *(*)(struct inpcb *, int))
#define INP_SOCKREF
Definition: in_pcb.h:647
#define INP_ITERATOR(_ipi, _lock, _match, _ctx)
Definition: in_pcb.h:787
#define INP_PCBHASH(faddr, lport, fport, mask)
Definition: in_pcb.h:595
#define INP_RECVIF
Definition: in_pcb.h:627
int in_pcb_lport(struct inpcb *, struct in_addr *, u_short *, struct ucred *, int)
#define V_in_pcbhashseed
Definition: in_pcb_var.h:48
struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, int, struct ucred *)
int in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
#define IA_SIN(ia)
Definition: in_var.h:96
#define IN_MULTI_LOCK_ASSERT()
Definition: in_var.h:381
#define V_in_ifaddrhead
Definition: in_var.h:118
#define IP_MFILTER_FOREACH(imf, head)
Definition: in_var.h:263
static void ip_mfilter_remove(struct ip_mfilter_head *head, struct in_mfilter *imf)
Definition: in_var.h:257
#define ia_ifp
Definition: in_var.h:77
#define ia_broadaddr
Definition: in_var.h:86
static LIST_HEAD(carp_softc)
Definition: ip_carp.c:333
ipfw_dyn_rule * next
Definition: ip_fw.h:0
u_int16_t count
Definition: ip_fw.h:18
Definition: in.h:83
in_addr_t s_addr
Definition: in.h:84
u_int8_t inc_flags
Definition: in_pcb.h:114
u_int16_t inc_fibnum
Definition: in_pcb.h:116
struct in_endpoints inc_ie
Definition: in_pcb.h:118
union in_dependaddr ie_dependladdr
Definition: in_pcb.h:101
struct sockaddr_in ia_addr
Definition: in_var.h:84
struct in_multi * imf_inm
Definition: in_var.h:223
struct ifnet * inm_ifp
Definition: in_var.h:302
inp_match_t * match
Definition: in_pcb.h:779
struct inpcb * inp
Definition: in_pcb.h:778
const inp_lookup_t lock
Definition: in_pcb.h:783
const struct inpcbinfo * ipi
Definition: in_pcb.h:777
void * ctx
Definition: in_pcb.h:780
Definition: in_pcb.h:217
uint32_t inp_rss_listen_bucket
Definition: in_pcb.h:267
struct socket * inp_socket
Definition: in_pcb.h:254
struct inpcbport * inp_phd
Definition: in_pcb.h:296
struct icmp6_filter * in6p_icmp6filt
Definition: in_pcb.h:290
struct ip6_pktopts * in6p_outputopts
Definition: in_pcb.h:286
inp_gen_t inp_gencnt
Definition: in_pcb.h:297
u_char inp_ip_minttl
Definition: in_pcb.h:263
short in6p_hops
Definition: in_pcb.h:293
struct route inp_route
Definition: in_pcb.h:301
struct rwlock inp_lock
Definition: in_pcb.h:220
volatile uint16_t inp_hpts_cpu
Definition: in_pcb.h:243
struct ucred * inp_cred
Definition: in_pcb.h:258
uint32_t inp_in_hpts
Definition: in_pcb.h:241
u_char inp_ip_p
Definition: in_pcb.h:262
int inp_flags
Definition: in_pcb.h:246
struct inpcbpolicy * inp_sp
Definition: in_pcb.h:274
int inp_flags2
Definition: in_pcb.h:247
u_char inp_vflag
Definition: in_pcb.h:260
u_int32_t inp_flow
Definition: in_pcb.h:259
uint32_t inp_flowtype
Definition: in_pcb.h:266
u_int inp_refcount
Definition: in_pcb.h:245
struct mbuf * inp_options
Definition: in_pcb.h:279
u_char inp_ip_ttl
Definition: in_pcb.h:261
struct m_snd_tag * inp_snd_tag
Definition: in_pcb.h:265
struct ip6_moptions * in6p_moptions
Definition: in_pcb.h:288
void * inp_ppcb
Definition: in_pcb.h:253
struct ip_moptions * inp_moptions
Definition: in_pcb.h:280
uint8_t inp_numa_domain
Definition: in_pcb.h:252
struct label * inp_label
Definition: in_pcb.h:273
u_char inp_ip_tos
Definition: in_pcb.h:278
uint32_t inp_flowid
Definition: in_pcb.h:264
int in6p_cksum
Definition: in_pcb.h:292
struct inpcbinfo * inp_pcbinfo
Definition: in_pcb.h:257
struct mbuf * in6p_options
Definition: in_pcb.h:284
struct in_conninfo inp_inc
Definition: in_pcb.h:270
struct inpcbhead * ipi_hashbase
Definition: in_pcb.h:438
struct vnet * ipi_vnet
Definition: in_pcb.h:457
uma_zone_t ipi_portzone
Definition: in_pcb.h:430
struct mtx ipi_hash_lock
Definition: in_pcb.h:437
u_quad_t ipi_gencnt
Definition: in_pcb.h:417
smr_t ipi_smr
Definition: in_pcb.h:431
struct inpcblbgrouphead * ipi_lbgrouphashbase
Definition: in_pcb.h:451
u_long ipi_lbgrouphashmask
Definition: in_pcb.h:452
u_short ipi_lastport
Definition: in_pcb.h:422
uma_zone_t ipi_zone
Definition: in_pcb.h:429
struct inpcbporthead * ipi_porthashbase
Definition: in_pcb.h:444
struct inpcbhead ipi_listhead
Definition: in_pcb.h:410
u_long ipi_hashmask
Definition: in_pcb.h:439
u_long ipi_porthashmask
Definition: in_pcb.h:445
u_short ipi_lasthi
Definition: in_pcb.h:424
struct mtx ipi_lock
Definition: in_pcb.h:409
u_int ipi_count
Definition: in_pcb.h:411
u_short ipi_lastlow
Definition: in_pcb.h:423
uint32_t il_inpcnt
Definition: in_pcb.h:512
u_int8_t il_numa_domain
Definition: in_pcb.h:506
uint16_t il_lport
Definition: in_pcb.h:504
union in_dependaddr il_dependladdr
Definition: in_pcb.h:508
u_char il_vflag
Definition: in_pcb.h:505
struct inpcb * il_inp[]
Definition: in_pcb.h:513
uint32_t il_inpsiz
Definition: in_pcb.h:511
struct epoch_context il_epoch_ctx
Definition: in_pcb.h:503
u_short phd_port
Definition: in_pcb_var.h:62
struct inpcbhead phd_pcblist
Definition: in_pcb_var.h:60
uma_zone_t ips_zone
Definition: in_pcb.h:465
const char * ips_zone_name
Definition: in_pcb.h:468
uma_zone_t ips_portzone
Definition: in_pcb.h:466
uma_init ips_pcbinit
Definition: in_pcb.h:467
const char * ips_portzone_name
Definition: in_pcb.h:469
const char * ips_hashlock_name
Definition: in_pcb.h:471
const char * ips_infolock_name
Definition: in_pcb.h:470
Definition: in.h:97
char sin_zero[8]
Definition: in.h:102
struct in_addr sin_addr
Definition: in.h:101
uint8_t sin_len
Definition: in.h:98
sa_family_t sin_family
Definition: in.h:99
in_port_t sin_port
Definition: in.h:100
Definition: tcp_var.h:132
Definition: tcp_var.h:629
short tw_so_options
Definition: tcp_var.h:636
#define TCP_REUSPORT_LB_NUMA_CURDOM
Definition: tcp.h:440
#define TCP_REUSPORT_LB_NUMA_NODOM
Definition: tcp.h:439
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD, &hpts_hopelessly_behind, "Number of times hpts could not catch up and was behind hopelessly")
uint16_t hpts_random_cpu(struct inpcb *inp)
Definition: tcp_hpts.c:994
#define sototcpcb(so)
Definition: tcp_var.h:647
#define intotw(ip)
Definition: tcp_var.h:646
#define V_tcbinfo
Definition: tcp_var.h:1030
#define V_udbinfo
Definition: udp_var.h:144
#define V_ulitecbinfo
Definition: udp_var.h:145