FreeBSD kernel kern code
uipc_socket.c
1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1988, 1990, 1993
5 * The Regents of the University of California.
6 * Copyright (c) 2004 The FreeBSD Foundation
7 * Copyright (c) 2004-2008 Robert N. M. Watson
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
35 */
36
37/*
38 * Comments on the socket life cycle:
39 *
40 * soalloc() sets up socket layer state for a socket, called only by
41 * socreate() and sonewconn(). Socket layer private.
42 *
43 * sodealloc() tears down socket layer state for a socket, called only by
44 * sofree() and sonewconn(). Socket layer private.
45 *
46 * pru_attach() associates protocol layer state with an allocated socket;
47 * called only once, may fail, aborting socket allocation. This is called
48 * from socreate() and sonewconn(). Socket layer private.
49 *
50 * pru_detach() disassociates protocol layer state from an attached socket,
51 * and will be called exactly once for sockets in which pru_attach() has
52 * been successfully called. If pru_attach() returned an error,
53 * pru_detach() will not be called. Socket layer private.
54 *
55 * pru_abort() and pru_close() notify the protocol layer that the last
56 * consumer of a socket is starting to tear down the socket, and that the
57 * protocol should terminate the connection. Historically, pru_abort() also
58 * detached protocol state from the socket state, but this is no longer the
59 * case.
60 *
61 * socreate() creates a socket and attaches protocol state. This is a public
62 * interface that may be used by socket layer consumers to create new
63 * sockets.
64 *
65 * sonewconn() creates a socket and attaches protocol state. This is a
66 * public interface that may be used by protocols to create new sockets when
67 * a new connection is received and will be available for accept() on a
68 * listen socket.
69 *
70 * soclose() destroys a socket after possibly waiting for it to disconnect.
71 * This is a public interface that socket consumers should use to close and
72 * release a socket when done with it.
73 *
74 * soabort() destroys a socket without waiting for it to disconnect (used
75 * only for incoming connections that are already partially or fully
76 * connected). This is used internally by the socket layer when clearing
77 * listen socket queues (due to overflow or close on the listen socket), but
78 * is also a public interface protocols may use to abort connections in
79 * their incomplete listen queues should they no longer be required. Sockets
80 * placed in completed connection listen queues should not be aborted for
81 * reasons described in the comment above the soclose() implementation. This
82 * is not a general purpose close routine, and except in the specific
83 * circumstances described here, should not be used.
84 *
85 * sofree() will free a socket and its protocol state if all references on
86 * the socket have been released, and is the public interface to attempt to
87 * free a socket when a reference is removed. This is a socket layer private
88 * interface.
89 *
90 * NOTE: In addition to socreate() and soclose(), which provide a single
91 * socket reference to the consumer to be managed as required, there are two
92 * calls to explicitly manage socket references, soref(), and sorele().
93 * Currently, these are generally required only when transitioning a socket
94 * from a listen queue to a file descriptor, in order to prevent garbage
95 * collection of the socket at an untimely moment. For a number of reasons,
96 * these interfaces are not preferred, and should be avoided.
97 *
98 * NOTE: With regard to VNETs the general rule is that callers do not set
99 * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
100 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
101 * and sorflush(), which are usually called from a pre-set VNET context.
102 * sopoll() currently does not need a VNET context to be set.
103 */
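/*
 * Illustrative sketch (editorial addition, not part of uipc_socket.c): how an
 * in-kernel consumer might use the public life-cycle interfaces described
 * above. The function name and error handling are hypothetical; only the
 * socreate()/sobind()/solisten()/soclose() calls correspond to interfaces
 * implemented in this file. The reference returned by socreate() is released
 * by soclose().
 *
 *	static int
 *	example_listen(struct sockaddr *sa, struct thread *td, struct socket **sop)
 *	{
 *		struct socket *so;
 *		int error;
 *
 *		error = socreate(sa->sa_family, &so, SOCK_STREAM, 0,
 *		    td->td_ucred, td);
 *		if (error != 0)
 *			return (error);
 *		error = sobind(so, sa, td);
 *		if (error == 0)
 *			error = solisten(so, SOMAXCONN, td);
 *		if (error != 0) {
 *			soclose(so);
 *			return (error);
 *		}
 *		*sop = so;
 *		return (0);
 *	}
 */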
104
105#include <sys/cdefs.h>
106__FBSDID("$FreeBSD$");
107
108#include "opt_inet.h"
109#include "opt_inet6.h"
110#include "opt_kern_tls.h"
111#include "opt_sctp.h"
112
113#include <sys/param.h>
114#include <sys/systm.h>
115#include <sys/capsicum.h>
116#include <sys/fcntl.h>
117#include <sys/limits.h>
118#include <sys/lock.h>
119#include <sys/mac.h>
120#include <sys/malloc.h>
121#include <sys/mbuf.h>
122#include <sys/mutex.h>
123#include <sys/domain.h>
124#include <sys/file.h> /* for struct knote */
125#include <sys/hhook.h>
126#include <sys/kernel.h>
127#include <sys/khelp.h>
128#include <sys/ktls.h>
129#include <sys/event.h>
130#include <sys/eventhandler.h>
131#include <sys/poll.h>
132#include <sys/proc.h>
133#include <sys/protosw.h>
134#include <sys/sbuf.h>
135#include <sys/socket.h>
136#include <sys/socketvar.h>
137#include <sys/resourcevar.h>
138#include <net/route.h>
139#include <sys/signalvar.h>
140#include <sys/stat.h>
141#include <sys/sx.h>
142#include <sys/sysctl.h>
143#include <sys/taskqueue.h>
144#include <sys/uio.h>
145#include <sys/un.h>
146#include <sys/unpcb.h>
147#include <sys/jail.h>
148#include <sys/syslog.h>
149#include <netinet/in.h>
150#include <netinet/in_pcb.h>
151#include <netinet/tcp.h>
152
153#include <net/vnet.h>
154
155#include <security/mac/mac_framework.h>
156
157#include <vm/uma.h>
158
159#ifdef COMPAT_FREEBSD32
160#include <sys/mount.h>
161#include <sys/sysent.h>
162#include <compat/freebsd32/freebsd32.h>
163#endif
164
165static int soreceive_rcvoob(struct socket *so, struct uio *uio,
166 int flags);
167static void so_rdknl_lock(void *);
168static void so_rdknl_unlock(void *);
169static void so_rdknl_assert_lock(void *, int);
170static void so_wrknl_lock(void *);
171static void so_wrknl_unlock(void *);
172static void so_wrknl_assert_lock(void *, int);
173
174static void filt_sordetach(struct knote *kn);
175static int filt_soread(struct knote *kn, long hint);
176static void filt_sowdetach(struct knote *kn);
177static int filt_sowrite(struct knote *kn, long hint);
178static int filt_soempty(struct knote *kn, long hint);
179static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
180fo_kqfilter_t soo_kqfilter;
181
182static struct filterops soread_filtops = {
183 .f_isfd = 1,
184 .f_detach = filt_sordetach,
185 .f_event = filt_soread,
186};
187static struct filterops sowrite_filtops = {
188 .f_isfd = 1,
189 .f_detach = filt_sowdetach,
190 .f_event = filt_sowrite,
191};
192static struct filterops soempty_filtops = {
193 .f_isfd = 1,
194 .f_detach = filt_sowdetach,
195 .f_event = filt_soempty,
196};
197
198so_gen_t so_gencnt; /* generation count for sockets */
199
200MALLOC_DEFINE(M_SONAME, "soname", "socket name");
201MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
202
203#define VNET_SO_ASSERT(so) \
204 VNET_ASSERT(curvnet != NULL, \
205 ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
206
207VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
208#define V_socket_hhh VNET(socket_hhh)
209
210/*
211 * Limit on the number of connections in the listen queue waiting
212 * for accept(2).
213 * NB: The original sysctl somaxconn is still available but hidden
214 * to prevent confusion about the actual purpose of this number.
215 */
216static u_int somaxconn = SOMAXCONN;
217
218static int
219sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
220{
221 int error;
222 int val;
223
224 val = somaxconn;
225 error = sysctl_handle_int(oidp, &val, 0, req);
226 if (error || !req->newptr )
227 return (error);
228
229 /*
230 * The purpose of the UINT_MAX / 3 limit is to ensure that the formula
231 * 3 * so_qlimit / 2
232 * below will not overflow.
233 */
234
235 if (val < 1 || val > UINT_MAX / 3)
236 return (EINVAL);
237
238 somaxconn = val;
239 return (0);
240}
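/*
 * Editorial note (not part of uipc_socket.c): with the cap above, the accepted
 * value is at most UINT_MAX / 3, and sol_qlimit is later clamped to somaxconn
 * in solisten_proto(). The listen queue overflow test in sonewconn(), which
 * computes 3 * sol_qlimit / 2, therefore stays at or below roughly
 * UINT_MAX / 2 and cannot wrap around an unsigned int.
 */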
241SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue,
242 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int),
243 sysctl_somaxconn, "I",
244 "Maximum listen socket pending connection accept queue size");
245SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
246 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0,
247 sizeof(int), sysctl_somaxconn, "I",
248 "Maximum listen socket pending connection accept queue size (compat)");
249
250static int numopensockets;
251SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
252 &numopensockets, 0, "Number of open sockets");
253
254/*
255 * accept_mtx locks down per-socket fields relating to accept queues. See
256 * socketvar.h for an annotation of the protected fields of struct socket.
257 */
258struct mtx accept_mtx;
259MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
260
261/*
262 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
263 * so_gencnt field.
264 */
265static struct mtx so_global_mtx;
266MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
267
268/*
269 * General IPC sysctl name space, used by sockets and a variety of other IPC
270 * types.
271 */
272SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
273 "IPC");
274
275/*
276 * Initialize the socket subsystem and set up the socket
277 * memory allocator.
278 */
279static uma_zone_t socket_zone;
280int maxsockets;
281
282static void
283socket_zone_change(void *tag)
284{
285
286 maxsockets = uma_zone_set_max(socket_zone, maxsockets);
287}
288
289static void
290socket_hhook_register(int subtype)
291{
292
293 if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
294 &V_socket_hhh[subtype],
295 HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
296 printf("%s: WARNING: unable to register hook\n", __func__);
297}
298
299static void
300socket_hhook_deregister(int subtype)
301{
302
303 if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
304 printf("%s: WARNING: unable to deregister hook\n", __func__);
305}
306
307static void
308socket_init(void *tag)
309{
310
311 socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
312 NULL, NULL, UMA_ALIGN_PTR, 0);
313 maxsockets = uma_zone_set_max(socket_zone, maxsockets);
314 uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
315 EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
316 EVENTHANDLER_PRI_FIRST);
317}
318SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
319
320static void
321socket_vnet_init(const void *unused __unused)
322{
323 int i;
324
325 /* We expect a contiguous range */
326 for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
327 socket_hhook_register(i);
328}
329VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
330 socket_vnet_init, NULL);
331
332static void
333socket_vnet_uninit(const void *unused __unused)
334{
335 int i;
336
337 for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
338 socket_hhook_deregister(i);
339}
340VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
341 socket_vnet_uninit, NULL);
342
343/*
344 * Initialise maxsockets. This SYSINIT must be run after
345 * tunable_mbinit().
346 */
347static void
348init_maxsockets(void *ignored)
349{
350
351 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
352 maxsockets = imax(maxsockets, maxfiles);
353}
354SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
355
356/*
357 * Sysctl to get and set the maximum global sockets limit. Notify protocols
358 * of the change so that they can update their dependent limits as required.
359 */
360static int
361sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
362{
363 int error, newmaxsockets;
364
365 newmaxsockets = maxsockets;
366 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
367 if (error == 0 && req->newptr && newmaxsockets != maxsockets) {
368 if (newmaxsockets > maxsockets &&
369 newmaxsockets <= maxfiles) {
370 maxsockets = newmaxsockets;
371 EVENTHANDLER_INVOKE(maxsockets_change);
372 } else
373 error = EINVAL;
374 }
375 return (error);
376}
377SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets,
378 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &maxsockets, 0,
379 sysctl_maxsockets, "IU",
380 "Maximum number of sockets available");
381
382/*
383 * Socket operation routines. These routines are called by the routines in
384 * sys_socket.c or from a system process, and implement the semantics of
385 * socket operations by switching out to the protocol specific routines.
386 */
387
388/*
389 * Get a socket structure from our zone, and initialize it. Note that it
390 * would probably be better to allocate socket and PCB at the same time, but
391 * I'm not convinced that all the protocols can be easily modified to do
392 * this.
393 *
394 * soalloc() returns a socket with a ref count of 0.
395 */
396static struct socket *
397soalloc(struct vnet *vnet)
398{
399 struct socket *so;
400
401 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
402 if (so == NULL)
403 return (NULL);
404#ifdef MAC
405 if (mac_socket_init(so, M_NOWAIT) != 0) {
406 uma_zfree(socket_zone, so);
407 return (NULL);
408 }
409#endif
410 if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) {
411 uma_zfree(socket_zone, so);
412 return (NULL);
413 }
414
415 /*
416 * The socket locking protocol allows locking two sockets at a time,
417 * however, the first one must be a listening socket. WITNESS lacks
418 * a feature to change class of an existing lock, so we use DUPOK.
419 */
420 mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
421 so->so_snd.sb_mtx = &so->so_snd_mtx;
422 so->so_rcv.sb_mtx = &so->so_rcv_mtx;
423 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
424 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
425 so->so_rcv.sb_sel = &so->so_rdsel;
426 so->so_snd.sb_sel = &so->so_wrsel;
427 sx_init(&so->so_snd_sx, "so_snd_sx");
428 sx_init(&so->so_rcv_sx, "so_rcv_sx");
429 TAILQ_INIT(&so->so_snd.sb_aiojobq);
430 TAILQ_INIT(&so->so_rcv.sb_aiojobq);
431 TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so);
432 TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so);
433#ifdef VIMAGE
434 VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
435 __func__, __LINE__, so));
436 so->so_vnet = vnet;
437#endif
438 /* We shouldn't need the so_global_mtx */
439 if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) {
440 /* Do we need more comprehensive error returns? */
441 uma_zfree(socket_zone, so);
442 return (NULL);
443 }
444 mtx_lock(&so_global_mtx);
445 so->so_gencnt = ++so_gencnt;
446 ++numopensockets;
447#ifdef VIMAGE
448 vnet->vnet_sockcnt++;
449#endif
450 mtx_unlock(&so_global_mtx);
451
452 return (so);
453}
454
455/*
456 * Free the storage associated with a socket at the socket layer, tear down
457 * locks, labels, etc. All protocol state is assumed already to have been
458 * torn down (and possibly never set up) by the caller.
459 */
460static void
461sodealloc(struct socket *so)
462{
463
464 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
465 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
466
467 mtx_lock(&so_global_mtx);
468 so->so_gencnt = ++so_gencnt;
469 --numopensockets; /* Could be below, but faster here. */
470#ifdef VIMAGE
471 VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
472 __func__, __LINE__, so));
473 so->so_vnet->vnet_sockcnt--;
474#endif
475 mtx_unlock(&so_global_mtx);
476#ifdef MAC
477 mac_socket_destroy(so);
478#endif
479 hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);
480
481 khelp_destroy_osd(&so->osd);
482 if (SOLISTENING(so)) {
483 if (so->sol_accept_filter != NULL)
484 accept_filt_setopt(so, NULL);
485 } else {
486 if (so->so_rcv.sb_hiwat)
487 (void)chgsbsize(so->so_cred->cr_uidinfo,
488 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
489 if (so->so_snd.sb_hiwat)
490 (void)chgsbsize(so->so_cred->cr_uidinfo,
491 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
492 sx_destroy(&so->so_snd_sx);
493 sx_destroy(&so->so_rcv_sx);
494 SOCKBUF_LOCK_DESTROY(&so->so_snd);
495 SOCKBUF_LOCK_DESTROY(&so->so_rcv);
496 }
497 crfree(so->so_cred);
498 mtx_destroy(&so->so_lock);
499 uma_zfree(socket_zone, so);
500}
501
502/*
503 * socreate returns a socket with a ref count of 1. The socket should be
504 * closed with soclose().
505 */
506int
507socreate(int dom, struct socket **aso, int type, int proto,
508 struct ucred *cred, struct thread *td)
509{
510 struct protosw *prp;
511 struct socket *so;
512 int error;
513
514 if (proto)
515 prp = pffindproto(dom, proto, type);
516 else
517 prp = pffindtype(dom, type);
518
519 if (prp == NULL) {
520 /* No support for domain. */
521 if (pffinddomain(dom) == NULL)
522 return (EAFNOSUPPORT);
523 /* No support for socket type. */
524 if (proto == 0 && type != 0)
525 return (EPROTOTYPE);
526 return (EPROTONOSUPPORT);
527 }
528 if (prp->pr_usrreqs->pru_attach == NULL ||
529 prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
530 return (EPROTONOSUPPORT);
531
532 if (IN_CAPABILITY_MODE(td) && (prp->pr_flags & PR_CAPATTACH) == 0)
533 return (ECAPMODE);
534
535 if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
536 return (EPROTONOSUPPORT);
537
538 if (prp->pr_type != type)
539 return (EPROTOTYPE);
540 so = soalloc(CRED_TO_VNET(cred));
541 if (so == NULL)
542 return (ENOBUFS);
543
544 so->so_type = type;
545 so->so_cred = crhold(cred);
546 if ((prp->pr_domain->dom_family == PF_INET) ||
547 (prp->pr_domain->dom_family == PF_INET6) ||
548 (prp->pr_domain->dom_family == PF_ROUTE))
549 so->so_fibnum = td->td_proc->p_fibnum;
550 else
551 so->so_fibnum = 0;
552 so->so_proto = prp;
553#ifdef MAC
554 mac_socket_create(cred, so);
555#endif
556 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
557 so_rdknl_assert_lock);
558 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
559 so_wrknl_assert_lock);
560 /*
561 * Auto-sizing of socket buffers is managed by the protocols and
562 * the appropriate flags must be set in the pru_attach function.
563 */
564 CURVNET_SET(so->so_vnet);
565 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
566 CURVNET_RESTORE();
567 if (error) {
568 sodealloc(so);
569 return (error);
570 }
571 soref(so);
572 *aso = so;
573 return (0);
574}
575
576#ifdef REGRESSION
577static int regression_sonewconn_earlytest = 1;
578SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
579 &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
580#endif
581
582static struct timeval overinterval = { 60, 0 };
583SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW,
584 &overinterval,
585 "Delay in seconds between warnings for listen socket overflows");
586
587/*
588 * When an attempt at a new connection is noted on a socket which accepts
589 * connections, sonewconn is called. If the connection is possible (subject
590 * to space constraints, etc.) then we allocate a new structure, properly
591 * linked into the data structure of the original socket, and return this.
592 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
593 *
594 * Note: the ref count on the socket is 0 on return.
595 */
596struct socket *
597sonewconn(struct socket *head, int connstatus)
598{
599 struct sbuf descrsb;
600 struct socket *so;
601 int len, overcount;
602 u_int qlen;
603 const char localprefix[] = "local:";
604 char descrbuf[SUNPATHLEN + sizeof(localprefix)];
605#if defined(INET6)
606 char addrbuf[INET6_ADDRSTRLEN];
607#elif defined(INET)
608 char addrbuf[INET_ADDRSTRLEN];
609#endif
610 bool dolog, over;
611
612 SOLISTEN_LOCK(head);
613 over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
614#ifdef REGRESSION
615 if (regression_sonewconn_earlytest && over) {
616#else
617 if (over) {
618#endif
619 head->sol_overcount++;
620 dolog = !!ratecheck(&head->sol_lastover, &overinterval);
621
622 /*
623 * If we're going to log, copy the overflow count and queue
624 * length from the listen socket before dropping the lock.
625 * Also, reset the overflow count.
626 */
627 if (dolog) {
628 overcount = head->sol_overcount;
629 head->sol_overcount = 0;
630 qlen = head->sol_qlen;
631 }
632 SOLISTEN_UNLOCK(head);
633
634 if (dolog) {
635 /*
636 * Try to print something descriptive about the
637 * socket for the error message.
638 */
639 sbuf_new(&descrsb, descrbuf, sizeof(descrbuf),
640 SBUF_FIXEDLEN);
641 switch (head->so_proto->pr_domain->dom_family) {
642#if defined(INET) || defined(INET6)
643#ifdef INET
644 case AF_INET:
645#endif
646#ifdef INET6
647 case AF_INET6:
648 if (head->so_proto->pr_domain->dom_family ==
649 AF_INET6 ||
650 (sotoinpcb(head)->inp_inc.inc_flags &
651 INC_ISIPV6)) {
652 ip6_sprintf(addrbuf,
653 &sotoinpcb(head)->inp_inc.inc6_laddr);
654 sbuf_printf(&descrsb, "[%s]", addrbuf);
655 } else
656#endif
657 {
658#ifdef INET
659 inet_ntoa_r(
660 sotoinpcb(head)->inp_inc.inc_laddr,
661 addrbuf);
662 sbuf_cat(&descrsb, addrbuf);
663#endif
664 }
665 sbuf_printf(&descrsb, ":%hu (proto %u)",
666 ntohs(sotoinpcb(head)->inp_inc.inc_lport),
667 head->so_proto->pr_protocol);
668 break;
669#endif /* INET || INET6 */
670 case AF_UNIX:
671 sbuf_cat(&descrsb, localprefix);
672 if (sotounpcb(head)->unp_addr != NULL)
673 len =
674 sotounpcb(head)->unp_addr->sun_len -
675 offsetof(struct sockaddr_un,
676 sun_path);
677 else
678 len = 0;
679 if (len > 0)
680 sbuf_bcat(&descrsb,
681 sotounpcb(head)->unp_addr->sun_path,
682 len);
683 else
684 sbuf_cat(&descrsb, "(unknown)");
685 break;
686 }
687
688 /*
689 * If we can't print something more specific, at least
690 * print the domain name.
691 */
692 if (sbuf_finish(&descrsb) != 0 ||
693 sbuf_len(&descrsb) <= 0) {
694 sbuf_clear(&descrsb);
695 sbuf_cat(&descrsb,
696 head->so_proto->pr_domain->dom_name ?:
697 "unknown");
698 sbuf_finish(&descrsb);
699 }
700 KASSERT(sbuf_len(&descrsb) > 0,
701 ("%s: sbuf creation failed", __func__));
702 log(LOG_DEBUG,
703 "%s: pcb %p (%s): Listen queue overflow: "
704 "%i already in queue awaiting acceptance "
705 "(%d occurrences)\n",
706 __func__, head->so_pcb, sbuf_data(&descrsb),
707 qlen, overcount);
708 sbuf_delete(&descrsb);
709
710 overcount = 0;
711 }
712
713 return (NULL);
714 }
715 SOLISTEN_UNLOCK(head);
716 VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
717 __func__, head));
718 so = soalloc(head->so_vnet);
719 if (so == NULL) {
720 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
721 "limit reached or out of memory\n",
722 __func__, head->so_pcb);
723 return (NULL);
724 }
725 so->so_listen = head;
726 so->so_type = head->so_type;
727 so->so_options = head->so_options & ~SO_ACCEPTCONN;
728 so->so_linger = head->so_linger;
729 so->so_state = head->so_state | SS_NOFDREF;
730 so->so_fibnum = head->so_fibnum;
731 so->so_proto = head->so_proto;
732 so->so_cred = crhold(head->so_cred);
733#ifdef MAC
734 mac_socket_newconn(head, so);
735#endif
736 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
737 so_rdknl_assert_lock);
738 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
739 so_wrknl_assert_lock);
740 VNET_SO_ASSERT(head);
741 if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
742 sodealloc(so);
743 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
744 __func__, head->so_pcb);
745 return (NULL);
746 }
747 if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
748 sodealloc(so);
749 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
750 __func__, head->so_pcb);
751 return (NULL);
752 }
753 so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
754 so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
755 so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
756 so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
757 so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE;
758 so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE;
759
760 SOLISTEN_LOCK(head);
761 if (head->sol_accept_filter != NULL)
762 connstatus = 0;
763 so->so_state |= connstatus;
764 soref(head); /* A socket on (in)complete queue refs head. */
765 if (connstatus) {
766 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
767 so->so_qstate = SQ_COMP;
768 head->sol_qlen++;
769 solisten_wakeup(head); /* unlocks */
770 } else {
771 /*
772 * Keep removing sockets from the head until there's room for
773 * us to insert on the tail. In pre-locking revisions, this
774 * was a simple if(), but as we could be racing with other
775 * threads and soabort() requires dropping locks, we must
776 * loop waiting for the condition to be true.
777 */
778 while (head->sol_incqlen > head->sol_qlimit) {
779 struct socket *sp;
780
781 sp = TAILQ_FIRST(&head->sol_incomp);
782 TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
783 head->sol_incqlen--;
784 SOCK_LOCK(sp);
785 sp->so_qstate = SQ_NONE;
786 sp->so_listen = NULL;
787 SOCK_UNLOCK(sp);
788 sorele_locked(head); /* does SOLISTEN_UNLOCK, head stays */
789 soabort(sp);
790 SOLISTEN_LOCK(head);
791 }
792 TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
793 so->so_qstate = SQ_INCOMP;
794 head->sol_incqlen++;
795 SOLISTEN_UNLOCK(head);
796 }
797 return (so);
798}
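/*
 * Illustrative sketch (editorial addition, not part of uipc_socket.c): how a
 * protocol's input path might use sonewconn() when a connection request
 * arrives on a listening socket. The surrounding logic is hypothetical; only
 * sonewconn() and soisconnected() are real interfaces. Passing connstatus 0
 * places the new socket on the incomplete queue; a later soisconnected()
 * moves it to the complete queue, where solisten_dequeue() can find it.
 *
 *	struct socket *so;
 *
 *	so = sonewconn(head, 0);
 *	if (so == NULL)
 *		return;			// queue full; drop the request
 *	// ... protocol attaches its own connection state to so ...
 *	soisconnected(so);		// handshake complete: move to sol_comp
 */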
799
800#if defined(SCTP) || defined(SCTP_SUPPORT)
801/*
802 * Socket part of sctp_peeloff(). Detach a new socket from an
803 * association. The new socket is returned with a reference.
804 */
805struct socket *
806sopeeloff(struct socket *head)
807{
808 struct socket *so;
809
810 VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
811 __func__, __LINE__, head));
812 so = soalloc(head->so_vnet);
813 if (so == NULL) {
814 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
815 "limit reached or out of memory\n",
816 __func__, head->so_pcb);
817 return (NULL);
818 }
819 so->so_type = head->so_type;
820 so->so_options = head->so_options;
821 so->so_linger = head->so_linger;
822 so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
823 so->so_fibnum = head->so_fibnum;
824 so->so_proto = head->so_proto;
825 so->so_cred = crhold(head->so_cred);
826#ifdef MAC
827 mac_socket_newconn(head, so);
828#endif
829 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
830 so_rdknl_assert_lock);
831 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
832 so_wrknl_assert_lock);
833 VNET_SO_ASSERT(head);
834 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
835 sodealloc(so);
836 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
837 __func__, head->so_pcb);
838 return (NULL);
839 }
840 if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
841 sodealloc(so);
842 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
843 __func__, head->so_pcb);
844 return (NULL);
845 }
846 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
847 so->so_snd.sb_lowat = head->so_snd.sb_lowat;
848 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
849 so->so_snd.sb_timeo = head->so_snd.sb_timeo;
850 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
851 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
852
853 soref(so);
854
855 return (so);
856}
857#endif /* SCTP */
858
859int
860sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
861{
862 int error;
863
864 CURVNET_SET(so->so_vnet);
865 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
866 CURVNET_RESTORE();
867 return (error);
868}
869
870int
871sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
872{
873 int error;
874
875 CURVNET_SET(so->so_vnet);
876 error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);
877 CURVNET_RESTORE();
878 return (error);
879}
880
881/*
882 * solisten() transitions a socket from a non-listening state to a listening
883 * state, but can also be used to update the listen queue depth on an
884 * existing listen socket. The protocol will call back into the sockets
885 * layer using solisten_proto_check() and solisten_proto() to check and set
886 * socket-layer listen state. Call backs are used so that the protocol can
887 * acquire both protocol and socket layer locks in whatever order is required
888 * by the protocol.
889 *
890 * Protocol implementors are advised to hold the socket lock across the
891 * socket-layer test and set to avoid races at the socket layer.
892 */
893int
894solisten(struct socket *so, int backlog, struct thread *td)
895{
896 int error;
897
898 CURVNET_SET(so->so_vnet);
899 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
900 CURVNET_RESTORE();
901 return (error);
902}
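/*
 * Illustrative sketch (editorial addition, not part of uipc_socket.c): the
 * check/commit pattern a protocol's pru_listen method might follow, per the
 * comment above. The pcb locking and example_autobind() helper are
 * hypothetical; solisten_proto_check(), solisten_proto() and
 * solisten_proto_abort() are the real socket-layer callbacks.
 *
 *	static int
 *	example_pru_listen(struct socket *so, int backlog, struct thread *td)
 *	{
 *		int error;
 *
 *		// ... acquire protocol (pcb) locks as required ...
 *		SOCK_LOCK(so);
 *		error = solisten_proto_check(so);
 *		if (error == 0) {
 *			// hypothetical protocol-specific setup, e.g. binding
 *			// an ephemeral port if the socket is not yet bound
 *			error = example_autobind(so, td);
 *			if (error == 0)
 *				solisten_proto(so, backlog);
 *			else
 *				solisten_proto_abort(so);
 *		}
 *		SOCK_UNLOCK(so);
 *		// ... release protocol locks ...
 *		return (error);
 *	}
 */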
903
904/*
905 * Prepare for a call to solisten_proto(). Acquire all socket buffer locks in
906 * order to interlock with socket I/O.
907 */
908int
909solisten_proto_check(struct socket *so)
910{
911 SOCK_LOCK_ASSERT(so);
912
913 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
914 SS_ISDISCONNECTING)) != 0)
915 return (EINVAL);
916
917 /*
918 * Sleeping is not permitted here, so simply fail if userspace is
919 * attempting to transmit or receive on the socket. This kind of
920 * transient failure is not ideal, but it should occur only if userspace
921 * is misusing the socket interfaces.
922 */
923 if (!sx_try_xlock(&so->so_snd_sx))
924 return (EAGAIN);
925 if (!sx_try_xlock(&so->so_rcv_sx)) {
926 sx_xunlock(&so->so_snd_sx);
927 return (EAGAIN);
928 }
929 mtx_lock(&so->so_snd_mtx);
930 mtx_lock(&so->so_rcv_mtx);
931
932 /* Interlock with soo_aio_queue(). */
933 if ((so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 ||
934 (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0) {
935 solisten_proto_abort(so);
936 return (EINVAL);
937 }
938 return (0);
939}
940
941/*
942 * Undo the setup done by solisten_proto_check().
943 */
944void
945solisten_proto_abort(struct socket *so)
946{
947 mtx_unlock(&so->so_snd_mtx);
948 mtx_unlock(&so->so_rcv_mtx);
949 sx_xunlock(&so->so_snd_sx);
950 sx_xunlock(&so->so_rcv_sx);
951}
952
953void
954solisten_proto(struct socket *so, int backlog)
955{
956 int sbrcv_lowat, sbsnd_lowat;
957 u_int sbrcv_hiwat, sbsnd_hiwat;
958 short sbrcv_flags, sbsnd_flags;
959 sbintime_t sbrcv_timeo, sbsnd_timeo;
960
961 SOCK_LOCK_ASSERT(so);
962 KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
963 SS_ISDISCONNECTING)) == 0,
964 ("%s: bad socket state %p", __func__, so));
965
966 if (SOLISTENING(so))
967 goto listening;
968
969 /*
970 * Change this socket to listening state.
971 */
972 sbrcv_lowat = so->so_rcv.sb_lowat;
973 sbsnd_lowat = so->so_snd.sb_lowat;
974 sbrcv_hiwat = so->so_rcv.sb_hiwat;
975 sbsnd_hiwat = so->so_snd.sb_hiwat;
976 sbrcv_flags = so->so_rcv.sb_flags;
977 sbsnd_flags = so->so_snd.sb_flags;
978 sbrcv_timeo = so->so_rcv.sb_timeo;
979 sbsnd_timeo = so->so_snd.sb_timeo;
980
981 sbdestroy(&so->so_snd, so);
982 sbdestroy(&so->so_rcv, so);
983
984#ifdef INVARIANTS
985 bzero(&so->so_rcv,
986 sizeof(struct socket) - offsetof(struct socket, so_rcv));
987#endif
988
989 so->sol_sbrcv_lowat = sbrcv_lowat;
990 so->sol_sbsnd_lowat = sbsnd_lowat;
991 so->sol_sbrcv_hiwat = sbrcv_hiwat;
992 so->sol_sbsnd_hiwat = sbsnd_hiwat;
993 so->sol_sbrcv_flags = sbrcv_flags;
994 so->sol_sbsnd_flags = sbsnd_flags;
995 so->sol_sbrcv_timeo = sbrcv_timeo;
996 so->sol_sbsnd_timeo = sbsnd_timeo;
997
998 so->sol_qlen = so->sol_incqlen = 0;
999 TAILQ_INIT(&so->sol_incomp);
1000 TAILQ_INIT(&so->sol_comp);
1001
1002 so->sol_accept_filter = NULL;
1003 so->sol_accept_filter_arg = NULL;
1004 so->sol_accept_filter_str = NULL;
1005
1006 so->sol_upcall = NULL;
1007 so->sol_upcallarg = NULL;
1008
1009 so->so_options |= SO_ACCEPTCONN;
1010
1011listening:
1012 if (backlog < 0 || backlog > somaxconn)
1013 backlog = somaxconn;
1014 so->sol_qlimit = backlog;
1015
1016 mtx_unlock(&so->so_snd_mtx);
1017 mtx_unlock(&so->so_rcv_mtx);
1018 sx_xunlock(&so->so_snd_sx);
1019 sx_xunlock(&so->so_rcv_sx);
1020}
1021
1022/*
1023 * Wakeup listeners/subsystems once we have a complete connection.
1024 * Enters with lock, returns unlocked.
1025 */
1026void
1027solisten_wakeup(struct socket *sol)
1028{
1029
1030 if (sol->sol_upcall != NULL)
1031 (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
1032 else {
1033 selwakeuppri(&sol->so_rdsel, PSOCK);
1034 KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
1035 }
1036 SOLISTEN_UNLOCK(sol);
1037 wakeup_one(&sol->sol_comp);
1038 if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL)
1039 pgsigio(&sol->so_sigio, SIGIO, 0);
1040}
1041
1042/*
1043 * Return a single connection off a listening socket queue. The main consumer
1044 * of the function is kern_accept4(). Some modules that do their own accept
1045 * management also use the function.
1046 *
1047 * Listening socket must be locked on entry and is returned unlocked on
1048 * return.
1049 * The flags argument is a set of accept4(2) flags and ACCEPT4_INHERIT.
1050 */
1051int
1052solisten_dequeue(struct socket *head, struct socket **ret, int flags)
1053{
1054 struct socket *so;
1055 int error;
1056
1057 SOLISTEN_LOCK_ASSERT(head);
1058
1059 while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
1060 head->so_error == 0) {
1061 error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH,
1062 "accept", 0);
1063 if (error != 0) {
1064 SOLISTEN_UNLOCK(head);
1065 return (error);
1066 }
1067 }
1068 if (head->so_error) {
1069 error = head->so_error;
1070 head->so_error = 0;
1071 } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp))
1072 error = EWOULDBLOCK;
1073 else
1074 error = 0;
1075 if (error) {
1076 SOLISTEN_UNLOCK(head);
1077 return (error);
1078 }
1079 so = TAILQ_FIRST(&head->sol_comp);
1080 SOCK_LOCK(so);
1081 KASSERT(so->so_qstate == SQ_COMP,
1082 ("%s: so %p not SQ_COMP", __func__, so));
1083 soref(so);
1084 head->sol_qlen--;
1085 so->so_qstate = SQ_NONE;
1086 so->so_listen = NULL;
1087 TAILQ_REMOVE(&head->sol_comp, so, so_list);
1088 if (flags & ACCEPT4_INHERIT)
1089 so->so_state |= (head->so_state & SS_NBIO);
1090 else
1091 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
1092 SOCK_UNLOCK(so);
1093 sorele_locked(head);
1094
1095 *ret = so;
1096 return (0);
1097}
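/*
 * Illustrative sketch (editorial addition, not part of uipc_socket.c): the
 * core of an accept path built on solisten_dequeue(), loosely following what
 * kern_accept4() does. The function name is hypothetical and the file
 * descriptor plumbing is omitted.
 *
 *	static int
 *	example_accept(struct socket *head, struct sockaddr **namp)
 *	{
 *		struct socket *so;
 *		int error;
 *
 *		SOLISTEN_LOCK(head);
 *		error = solisten_dequeue(head, &so, 0);	// returns unlocked
 *		if (error != 0)
 *			return (error);
 *		// so carries the reference taken by solisten_dequeue()
 *		error = soaccept(so, namp);
 *		if (error != 0) {
 *			soclose(so);
 *			return (error);
 *		}
 *		// caller frees *namp (M_SONAME) and owns the socket reference
 *		return (0);
 *	}
 */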
1098
1099/*
1100 * Evaluate the reference count and named references on a socket; if no
1101 * references remain, free it. This should be called whenever a reference is
1102 * released, such as in sorele(), but also when named reference flags are
1103 * cleared in socket or protocol code.
1104 *
1105 * sofree() will free the socket if:
1106 *
1107 * - There are no outstanding file descriptor references or related consumers
1108 * (so_count == 0).
1109 *
1110 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
1111 *
1112 * - The protocol does not have an outstanding strong reference on the socket
1113 * (SS_PROTOREF).
1114 *
1115 * - The socket is not in a completed connection queue, so a process has been
1116 * notified that it is present. If it is removed, the user process may
1117 * block in accept() despite select() saying the socket was ready.
1118 */
1119void
1120sofree(struct socket *so)
1121{
1122 struct protosw *pr = so->so_proto;
1123 bool last __diagused;
1124
1125 SOCK_LOCK_ASSERT(so);
1126
1127 if ((so->so_state & (SS_NOFDREF | SS_PROTOREF)) != SS_NOFDREF ||
1128 refcount_load(&so->so_count) != 0 || so->so_qstate == SQ_COMP) {
1129 SOCK_UNLOCK(so);
1130 return;
1131 }
1132
1133 if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) {
1134 struct socket *sol;
1135
1136 sol = so->so_listen;
1137 KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so));
1138
1139 /*
1140 * To solve race between close of a listening socket and
1141 * a socket on its incomplete queue, we need to lock both.
1142 * The order is first listening socket, then regular.
1143 * Since we have neither SS_NOFDREF nor SS_PROTOREF, this
1144 * function and the listening socket are the only pointers
1145 * to so. To preserve so and sol, we reference both and then
1146 * relock.
1147 * After relock the socket may not move to so_comp since it
1148 * doesn't have PCB already, but it may be removed from
1149 * so_incomp. If that happens, we share responsibility for
1150 * freeing the socket, but soclose() has already removed
1151 * it from queue.
1152 */
1153 soref(sol);
1154 soref(so);
1155 SOCK_UNLOCK(so);
1156 SOLISTEN_LOCK(sol);
1157 SOCK_LOCK(so);
1158 if (so->so_qstate == SQ_INCOMP) {
1159 KASSERT(so->so_listen == sol,
1160 ("%s: so %p migrated out of sol %p",
1161 __func__, so, sol));
1162 TAILQ_REMOVE(&sol->sol_incomp, so, so_list);
1163 sol->sol_incqlen--;
1164 last = refcount_release(&sol->so_count);
1165 KASSERT(!last, ("%s: released last reference for %p",
1166 __func__, sol));
1167 so->so_qstate = SQ_NONE;
1168 so->so_listen = NULL;
1169 } else
1170 KASSERT(so->so_listen == NULL,
1171 ("%s: so %p not on (in)comp with so_listen",
1172 __func__, so));
1173 sorele_locked(sol);
1174 KASSERT(refcount_load(&so->so_count) == 1,
1175 ("%s: so %p count %u", __func__, so, so->so_count));
1176 so->so_count = 0;
1177 }
1178 if (SOLISTENING(so))
1179 so->so_error = ECONNABORTED;
1180 SOCK_UNLOCK(so);
1181
1182 if (so->so_dtor != NULL)
1183 so->so_dtor(so);
1184
1185 VNET_SO_ASSERT(so);
1186 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1187 (*pr->pr_domain->dom_dispose)(so);
1188 if (pr->pr_usrreqs->pru_detach != NULL)
1189 (*pr->pr_usrreqs->pru_detach)(so);
1190
1191 /*
1192 * From this point on, we assume that no other references to this
1193 * socket exist anywhere else in the stack. Therefore, no locks need
1194 * to be acquired or held.
1195 *
1196 * We used to do a lot of socket buffer and socket locking here, as
1197 * well as invoke sorflush() and perform wakeups. The direct call to
1198 * dom_dispose() and sbdestroy() are an inlining of what was
1199 * necessary from sorflush().
1200 *
1201 * Notice that the socket buffer and kqueue state are torn down
1202 * before calling pru_detach. This means that protocols should not
1203 * assume they can perform socket wakeups, etc, in their detach code.
1204 */
1205 if (!SOLISTENING(so)) {
1206 sbdestroy(&so->so_snd, so);
1207 sbdestroy(&so->so_rcv, so);
1208 }
1209 seldrain(&so->so_rdsel);
1210 seldrain(&so->so_wrsel);
1211 knlist_destroy(&so->so_rdsel.si_note);
1212 knlist_destroy(&so->so_wrsel.si_note);
1213 sodealloc(so);
1214}
1215
1216/*
1217 * Release a reference on a socket while holding the socket lock.
1218 * Unlocks the socket lock before returning.
1219 */
1220void
1221sorele_locked(struct socket *so)
1222{
1223 SOCK_LOCK_ASSERT(so);
1224 if (refcount_release(&so->so_count))
1225 sofree(so);
1226 else
1227 SOCK_UNLOCK(so);
1228}
1229
1230/*
1231 * Close a socket on last file table reference removal. Initiate disconnect
1232 * if connected. Free socket when disconnect complete.
1233 *
1234 * This function will sorele() the socket. Note that soclose() may be called
1235 * prior to the ref count reaching zero. The actual socket structure will
1236 * not be freed until the ref count reaches zero.
1237 */
1238int
1239soclose(struct socket *so)
1240{
1241 struct accept_queue lqueue;
1242 int error = 0;
1243 bool listening, last __diagused;
1244
1245 KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
1246
1247 CURVNET_SET(so->so_vnet);
1248 funsetown(&so->so_sigio);
1249 if (so->so_state & SS_ISCONNECTED) {
1250 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1251 error = sodisconnect(so);
1252 if (error) {
1253 if (error == ENOTCONN)
1254 error = 0;
1255 goto drop;
1256 }
1257 }
1258
1259 if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) {
1260 if ((so->so_state & SS_ISDISCONNECTING) &&
1261 (so->so_state & SS_NBIO))
1262 goto drop;
1263 while (so->so_state & SS_ISCONNECTED) {
1264 error = tsleep(&so->so_timeo,
1265 PSOCK | PCATCH, "soclos",
1266 so->so_linger * hz);
1267 if (error)
1268 break;
1269 }
1270 }
1271 }
1272
1273drop:
1274 if (so->so_proto->pr_usrreqs->pru_close != NULL)
1275 (*so->so_proto->pr_usrreqs->pru_close)(so);
1276
1277 SOCK_LOCK(so);
1278 if ((listening = SOLISTENING(so))) {
1279 struct socket *sp;
1280
1281 TAILQ_INIT(&lqueue);
1282 TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
1283 TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);
1284
1285 so->sol_qlen = so->sol_incqlen = 0;
1286
1287 TAILQ_FOREACH(sp, &lqueue, so_list) {
1288 SOCK_LOCK(sp);
1289 sp->so_qstate = SQ_NONE;
1290 sp->so_listen = NULL;
1291 SOCK_UNLOCK(sp);
1292 last = refcount_release(&so->so_count);
1293 KASSERT(!last, ("%s: released last reference for %p",
1294 __func__, so));
1295 }
1296 }
1297 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
1298 so->so_state |= SS_NOFDREF;
1299 sorele_locked(so);
1300 if (listening) {
1301 struct socket *sp, *tsp;
1302
1303 TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) {
1304 SOCK_LOCK(sp);
1305 if (refcount_load(&sp->so_count) == 0) {
1306 SOCK_UNLOCK(sp);
1307 soabort(sp);
1308 } else {
1309 /* See the handling of queued sockets
1310 in sofree(). */
1311 SOCK_UNLOCK(sp);
1312 }
1313 }
1314 }
1315 CURVNET_RESTORE();
1316 return (error);
1317}
1318
1319/*
1320 * soabort() is used to abruptly tear down a connection, such as when a
1321 * resource limit is reached (listen queue depth exceeded), or if a listen
1322 * socket is closed while there are sockets waiting to be accepted.
1323 *
1324 * This interface is tricky, because it is called on an unreferenced socket,
1325 * and must be called only by a thread that has actually removed the socket
1326 * from the listen queue it was on, or races with other threads are risked.
1327 *
1328 * This interface will call into the protocol code, so must not be called
1329 * with any socket locks held. Protocols do call it while holding their own
1330 * recursible protocol mutexes, but this is something that should be subject
1331 * to review in the future.
1332 */
1333void
1334soabort(struct socket *so)
1335{
1336
1337 /*
1338 * In as much as is possible, assert that no references to this
1339 * socket are held. This is not quite the same as asserting that the
1340 * current thread is responsible for arranging for no references, but
1341 * is as close as we can get for now.
1342 */
1343 KASSERT(so->so_count == 0, ("soabort: so_count"));
1344 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
1345 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
1346 VNET_SO_ASSERT(so);
1347
1348 if (so->so_proto->pr_usrreqs->pru_abort != NULL)
1349 (*so->so_proto->pr_usrreqs->pru_abort)(so);
1350 SOCK_LOCK(so);
1351 sofree(so);
1352}
1353
1354int
1355soaccept(struct socket *so, struct sockaddr **nam)
1356{
1357 int error;
1358
1359 SOCK_LOCK(so);
1360 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
1361 so->so_state &= ~SS_NOFDREF;
1362 SOCK_UNLOCK(so);
1363
1364 CURVNET_SET(so->so_vnet);
1365 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1366 CURVNET_RESTORE();
1367 return (error);
1368}
1369
1370int
1371soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
1372{
1373
1374 return (soconnectat(AT_FDCWD, so, nam, td));
1375}
1376
1377int
1378soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
1379{
1380 int error;
1381
1382 CURVNET_SET(so->so_vnet);
1383 /*
1384 * If protocol is connection-based, can only connect once.
1385 * Otherwise, if connected, try to disconnect first. This allows
1386 * user to disconnect by connecting to, e.g., a null address.
1387 */
1388 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1389 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1390 (error = sodisconnect(so)))) {
1391 error = EISCONN;
1392 } else {
1393 /*
1394 * Prevent accumulated error from previous connection from
1395 * biting us.
1396 */
1397 so->so_error = 0;
1398 if (fd == AT_FDCWD) {
1399 error = (*so->so_proto->pr_usrreqs->pru_connect)(so,
1400 nam, td);
1401 } else {
1402 error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd,
1403 so, nam, td);
1404 }
1405 }
1406 CURVNET_RESTORE();
1407
1408 return (error);
1409}
1410
1411int
1412soconnect2(struct socket *so1, struct socket *so2)
1413{
1414 int error;
1415
1416 CURVNET_SET(so1->so_vnet);
1417 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1418 CURVNET_RESTORE();
1419 return (error);
1420}
1421
1422int
1423sodisconnect(struct socket *so)
1424{
1425 int error;
1426
1427 if ((so->so_state & SS_ISCONNECTED) == 0)
1428 return (ENOTCONN);
1429 if (so->so_state & SS_ISDISCONNECTING)
1430 return (EALREADY);
1431 VNET_SO_ASSERT(so);
1432 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1433 return (error);
1434}
1435
1436int
1437sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
1438 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1439{
1440 long space;
1441 ssize_t resid;
1442 int clen = 0, error, dontroute;
1443
1444 KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
1445 KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
1446 ("sosend_dgram: !PR_ATOMIC"));
1447
1448 if (uio != NULL)
1449 resid = uio->uio_resid;
1450 else
1451 resid = top->m_pkthdr.len;
1452 /*
1453 * In theory resid should be unsigned. However, space must be
1454 * signed, as it might be less than 0 if we over-committed, and we
1455 * must use a signed comparison of space and resid. On the other
1456 * hand, a negative resid causes us to loop sending 0-length
1457 * segments to the protocol.
1458 */
1459 if (resid < 0) {
1460 error = EINVAL;
1461 goto out;
1462 }
1463
1464 dontroute =
1465 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
1466 if (td != NULL)
1467 td->td_ru.ru_msgsnd++;
1468 if (control != NULL)
1469 clen = control->m_len;
1470
1471 SOCKBUF_LOCK(&so->so_snd);
1472 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1473 SOCKBUF_UNLOCK(&so->so_snd);
1474 error = EPIPE;
1475 goto out;
1476 }
1477 if (so->so_error) {
1478 error = so->so_error;
1479 so->so_error = 0;
1480 SOCKBUF_UNLOCK(&so->so_snd);
1481 goto out;
1482 }
1483 if ((so->so_state & SS_ISCONNECTED) == 0) {
1484 /*
1485 * `sendto' and `sendmsg' are allowed on a connection-based
1486 * socket if it supports implied connect. Return ENOTCONN if
1487 * not connected and no address is supplied.
1488 */
1489 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1490 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1491 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1492 !(resid == 0 && clen != 0)) {
1493 SOCKBUF_UNLOCK(&so->so_snd);
1494 error = ENOTCONN;
1495 goto out;
1496 }
1497 } else if (addr == NULL) {
1498 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1499 error = ENOTCONN;
1500 else
1501 error = EDESTADDRREQ;
1502 SOCKBUF_UNLOCK(&so->so_snd);
1503 goto out;
1504 }
1505 }
1506
1507 /*
1508 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a
1509 * problem and need fixing.
1510 */
1511 space = sbspace(&so->so_snd);
1512 if (flags & MSG_OOB)
1513 space += 1024;
1514 space -= clen;
1515 SOCKBUF_UNLOCK(&so->so_snd);
1516 if (resid > space) {
1517 error = EMSGSIZE;
1518 goto out;
1519 }
1520 if (uio == NULL) {
1521 resid = 0;
1522 if (flags & MSG_EOR)
1523 top->m_flags |= M_EOR;
1524 } else {
1525 /*
1526 * Copy the data from userland into a mbuf chain.
1527 * If no data is to be copied in, a single empty mbuf
1528 * is returned.
1529 */
1530 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1531 (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1532 if (top == NULL) {
1533 error = EFAULT; /* only possible error */
1534 goto out;
1535 }
1536 space -= resid - uio->uio_resid;
1537 resid = uio->uio_resid;
1538 }
1539 KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1540 /*
1541 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1542 * than with.
1543 */
1544 if (dontroute) {
1545 SOCK_LOCK(so);
1546 so->so_options |= SO_DONTROUTE;
1547 SOCK_UNLOCK(so);
1548 }
1549 /*
1550 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1551 * of date. We could have received a reset packet in an interrupt or
1552 * maybe we slept while doing page faults in uiomove() etc. We could
1553 * probably recheck again inside the locking protection here, but
1554 * there are probably other places that this also happens. We must
1555 * rethink this.
1556 */
1557 VNET_SO_ASSERT(so);
1558 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1559 (flags & MSG_OOB) ? PRUS_OOB :
1560 /*
1561 * If the user set MSG_EOF, the protocol understands this flag, and
1562 * there is nothing left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1563 */
1564 ((flags & MSG_EOF) &&
1565 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1566 (resid <= 0)) ?
1567 PRUS_EOF :
1568 /* If there is more to send set PRUS_MORETOCOME */
1569 (flags & MSG_MORETOCOME) ||
1570 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1571 top, addr, control, td);
1572 if (dontroute) {
1573 SOCK_LOCK(so);
1574 so->so_options &= ~SO_DONTROUTE;
1575 SOCK_UNLOCK(so);
1576 }
1577 clen = 0;
1578 control = NULL;
1579 top = NULL;
1580out:
1581 if (top != NULL)
1582 m_freem(top);
1583 if (control != NULL)
1584 m_freem(control);
1585 return (error);
1586}
1587
1588/*
1589 * Send on a socket. If send must go all at once and message is larger than
1590 * send buffering, then hard error. Lock against other senders. If must go
1591 * all at once and not enough room now, then inform user that this would
1592 * block and do nothing. Otherwise, if nonblocking, send as much as
1593 * possible. The data to be sent is described by "uio" if nonzero, otherwise
1594 * by the mbuf chain "top" (which must be null if uio is not). Data provided
1595 * in mbuf chain must be small enough to send all at once.
1596 *
1597 * Returns nonzero on error, timeout or signal; callers must check for short
1598 * counts if EINTR/ERESTART are returned. Data and control buffers are freed
1599 * on return.
1600 */
1601int
1602sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1603 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1604{
1605 long space;
1606 ssize_t resid;
1607 int clen = 0, error, dontroute;
1608 int atomic = sosendallatonce(so) || top;
1609 int pru_flag;
1610#ifdef KERN_TLS
1611 struct ktls_session *tls;
1612 int tls_enq_cnt, tls_pruflag;
1613 uint8_t tls_rtype;
1614
1615 tls = NULL;
1616 tls_rtype = TLS_RLTYPE_APP;
1617#endif
1618 if (uio != NULL)
1619 resid = uio->uio_resid;
1620 else if ((top->m_flags & M_PKTHDR) != 0)
1621 resid = top->m_pkthdr.len;
1622 else
1623 resid = m_length(top, NULL);
1624 /*
1625 * In theory resid should be unsigned. However, space must be
1626 * signed, as it might be less than 0 if we over-committed, and we
1627 * must use a signed comparison of space and resid. On the other
1628 * hand, a negative resid causes us to loop sending 0-length
1629 * segments to the protocol.
1630 *
1631 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1632 * type sockets since that's an error.
1633 */
1634 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1635 error = EINVAL;
1636 goto out;
1637 }
1638
1639 dontroute =
1640 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1641 (so->so_proto->pr_flags & PR_ATOMIC);
1642 if (td != NULL)
1643 td->td_ru.ru_msgsnd++;
1644 if (control != NULL)
1645 clen = control->m_len;
1646
1647 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
1648 if (error)
1649 goto out;
1650
1651#ifdef KERN_TLS
1652 tls_pruflag = 0;
1653 tls = ktls_hold(so->so_snd.sb_tls_info);
1654 if (tls != NULL) {
1655 if (tls->mode == TCP_TLS_MODE_SW)
1656 tls_pruflag = PRUS_NOTREADY;
1657
1658 if (control != NULL) {
1659 struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1660
1661 if (clen >= sizeof(*cm) &&
1662 cm->cmsg_type == TLS_SET_RECORD_TYPE) {
1663 tls_rtype = *((uint8_t *)CMSG_DATA(cm));
1664 clen = 0;
1665 m_freem(control);
1666 control = NULL;
1667 atomic = 1;
1668 }
1669 }
1670
1671 if (resid == 0 && !ktls_permit_empty_frames(tls)) {
1672 error = EINVAL;
1673 goto release;
1674 }
1675 }
1676#endif
1677
1678restart:
1679 do {
1680 SOCKBUF_LOCK(&so->so_snd);
1681 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1682 SOCKBUF_UNLOCK(&so->so_snd);
1683 error = EPIPE;
1684 goto release;
1685 }
1686 if (so->so_error) {
1687 error = so->so_error;
1688 so->so_error = 0;
1689 SOCKBUF_UNLOCK(&so->so_snd);
1690 goto release;
1691 }
1692 if ((so->so_state & SS_ISCONNECTED) == 0) {
1693 /*
1694 * `sendto' and `sendmsg' are allowed on a connection-
1695 * based socket if it supports implied connect.
1696 * Return ENOTCONN if not connected and no address is
1697 * supplied.
1698 */
1699 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1700 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1701 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1702 !(resid == 0 && clen != 0)) {
1703 SOCKBUF_UNLOCK(&so->so_snd);
1704 error = ENOTCONN;
1705 goto release;
1706 }
1707 } else if (addr == NULL) {
1708 SOCKBUF_UNLOCK(&so->so_snd);
1709 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1710 error = ENOTCONN;
1711 else
1712 error = EDESTADDRREQ;
1713 goto release;
1714 }
1715 }
1716 space = sbspace(&so->so_snd);
1717 if (flags & MSG_OOB)
1718 space += 1024;
1719 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1720 clen > so->so_snd.sb_hiwat) {
1721 SOCKBUF_UNLOCK(&so->so_snd);
1722 error = EMSGSIZE;
1723 goto release;
1724 }
1725 if (space < resid + clen &&
1726 (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1727 if ((so->so_state & SS_NBIO) ||
1728 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
1729 SOCKBUF_UNLOCK(&so->so_snd);
1730 error = EWOULDBLOCK;
1731 goto release;
1732 }
1733 error = sbwait(&so->so_snd);
1734 SOCKBUF_UNLOCK(&so->so_snd);
1735 if (error)
1736 goto release;
1737 goto restart;
1738 }
1739 SOCKBUF_UNLOCK(&so->so_snd);
1740 space -= clen;
1741 do {
1742 if (uio == NULL) {
1743 resid = 0;
1744 if (flags & MSG_EOR)
1745 top->m_flags |= M_EOR;
1746#ifdef KERN_TLS
1747 if (tls != NULL) {
1748 ktls_frame(top, tls, &tls_enq_cnt,
1749 tls_rtype);
1750 tls_rtype = TLS_RLTYPE_APP;
1751 }
1752#endif
1753 } else {
1754 /*
1755 * Copy the data from userland into a mbuf
1756 * chain. If resid is 0, which can happen
1757 * only if we have control to send, then
1758 * a single empty mbuf is returned. This
1759 * is a workaround to prevent protocol send
1760 * methods from panicking.
1761 */
1762#ifdef KERN_TLS
1763 if (tls != NULL) {
1764 top = m_uiotombuf(uio, M_WAITOK, space,
1765 tls->params.max_frame_len,
1766 M_EXTPG |
1767 ((flags & MSG_EOR) ? M_EOR : 0));
1768 if (top != NULL) {
1769 ktls_frame(top, tls,
1770 &tls_enq_cnt, tls_rtype);
1771 }
1772 tls_rtype = TLS_RLTYPE_APP;
1773 } else
1774#endif
1775 top = m_uiotombuf(uio, M_WAITOK, space,
1776 (atomic ? max_hdr : 0),
1777 (atomic ? M_PKTHDR : 0) |
1778 ((flags & MSG_EOR) ? M_EOR : 0));
1779 if (top == NULL) {
1780 error = EFAULT; /* only possible error */
1781 goto release;
1782 }
1783 space -= resid - uio->uio_resid;
1784 resid = uio->uio_resid;
1785 }
1786 if (dontroute) {
1787 SOCK_LOCK(so);
1788 so->so_options |= SO_DONTROUTE;
1789 SOCK_UNLOCK(so);
1790 }
1791 /*
1792 * XXX all the SBS_CANTSENDMORE checks previously
1793 * done could be out of date. We could have received
1794 * a reset packet in an interrupt or maybe we slept
1795 * while doing page faults in uiomove() etc. We
1796 * could probably recheck again inside the locking
1797 * protection here, but there are probably other
1798 * places that this also happens. We must rethink
1799 * this.
1800 */
1801 VNET_SO_ASSERT(so);
1802
1803 pru_flag = (flags & MSG_OOB) ? PRUS_OOB :
1804 /*
1805 * If the user set MSG_EOF, the protocol understands
1806 * this flag, and there is nothing left to send, then use
1807 * PRU_SEND_EOF instead of PRU_SEND.
1808 */
1809 ((flags & MSG_EOF) &&
1810 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1811 (resid <= 0)) ?
1812 PRUS_EOF :
1813 /* If there is more to send set PRUS_MORETOCOME. */
1814 (flags & MSG_MORETOCOME) ||
1815 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
1816
1817#ifdef KERN_TLS
1818 pru_flag |= tls_pruflag;
1819#endif
1820
1821 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1822 pru_flag, top, addr, control, td);
1823
1824 if (dontroute) {
1825 SOCK_LOCK(so);
1826 so->so_options &= ~SO_DONTROUTE;
1827 SOCK_UNLOCK(so);
1828 }
1829
1830#ifdef KERN_TLS
1831 if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
1832 if (error != 0) {
1833 m_freem(top);
1834 top = NULL;
1835 } else {
1836 soref(so);
1837 ktls_enqueue(top, so, tls_enq_cnt);
1838 }
1839 }
1840#endif
1841 clen = 0;
1842 control = NULL;
1843 top = NULL;
1844 if (error)
1845 goto release;
1846 } while (resid && space > 0);
1847 } while (resid);
1848
1849release:
1850 SOCK_IO_SEND_UNLOCK(so);
1851out:
1852#ifdef KERN_TLS
1853 if (tls != NULL)
1854 ktls_free(tls);
1855#endif
1856 if (top != NULL)
1857 m_freem(top);
1858 if (control != NULL)
1859 m_freem(control);
1860 return (error);
1861}
1862
1863int
1864sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1865 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1866{
1867 int error;
1868
1869 CURVNET_SET(so->so_vnet);
1870 error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio,
1871 top, control, flags, td);
1872 CURVNET_RESTORE();
1873 return (error);
1874}
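/*
 * Illustrative sketch, not part of uipc_socket.c: one way a kernel
 * consumer might write a system-space buffer to a connected socket
 * through sosend().  The names example_sosend/buf/len are hypothetical;
 * only the return value is used for error handling.
 */
static int
example_sosend(struct socket *so, void *buf, size_t len, struct thread *td)
{
	struct uio auio;
	struct iovec aiov;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	/* No destination address or control data; default flags. */
	return (sosend(so, NULL, &auio, NULL, NULL, 0, td));
}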
1875
1876/*
1877 * The part of soreceive() that implements reading non-inline out-of-band
1878 * data from a socket. For more complete comments, see soreceive(), from
1879 * which this code originated.
1880 *
1881 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1882 * unable to return an mbuf chain to the caller.
1883 */
1884static int
1885soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1886{
1887 struct protosw *pr = so->so_proto;
1888 struct mbuf *m;
1889 int error;
1890
1891 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1892 VNET_SO_ASSERT(so);
1893
1894 m = m_get(M_WAITOK, MT_DATA);
1895 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1896 if (error)
1897 goto bad;
1898 do {
1899 error = uiomove(mtod(m, void *),
1900 (int) min(uio->uio_resid, m->m_len), uio);
1901 m = m_free(m);
1902 } while (uio->uio_resid && error == 0 && m);
1903bad:
1904 if (m != NULL)
1905 m_freem(m);
1906 return (error);
1907}
1908
1909/*
1910 * Following replacement or removal of the first mbuf on the first mbuf chain
1911 * of a socket buffer, push necessary state changes back into the socket
1912 * buffer so that other consumers see the values consistently. 'nextrecord'
 1913 * is the caller's locally stored copy of the original value of
 1914 * sb->sb_mb->m_nextpkt, which must be restored when the lead mbuf changes.
1915 * NOTE: 'nextrecord' may be NULL.
1916 */
1917static __inline void
1918sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1919{
1920
1921 SOCKBUF_LOCK_ASSERT(sb);
1922 /*
1923 * First, update for the new value of nextrecord. If necessary, make
1924 * it the first record.
1925 */
1926 if (sb->sb_mb != NULL)
1927 sb->sb_mb->m_nextpkt = nextrecord;
1928 else
1929 sb->sb_mb = nextrecord;
1930
1931 /*
1932 * Now update any dependent socket buffer fields to reflect the new
1933 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
1934 * addition of a second clause that takes care of the case where
1935 * sb_mb has been updated, but remains the last record.
1936 */
1937 if (sb->sb_mb == NULL) {
1938 sb->sb_mbtail = NULL;
1939 sb->sb_lastrecord = NULL;
1940 } else if (sb->sb_mb->m_nextpkt == NULL)
1941 sb->sb_lastrecord = sb->sb_mb;
1942}
1943
1944/*
1945 * Implement receive operations on a socket. We depend on the way that
1946 * records are added to the sockbuf by sbappend. In particular, each record
1947 * (mbufs linked through m_next) must begin with an address if the protocol
1948 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1949 * data, and then zero or more mbufs of data. In order to allow parallelism
1950 * between network receive and copying to user space, as well as avoid
1951 * sleeping with a mutex held, we release the socket buffer mutex during the
1952 * user space copy. Although the sockbuf is locked, new data may still be
1953 * appended, and thus we must maintain consistency of the sockbuf during that
1954 * time.
1955 *
1956 * The caller may receive the data as a single mbuf chain by supplying an
1957 * mbuf **mp0 for use in returning the chain. The uio is then used only for
1958 * the count in uio_resid.
1959 */
1960int
1961soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1962 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1963{
1964 struct mbuf *m, **mp;
1965 int flags, error, offset;
1966 ssize_t len;
1967 struct protosw *pr = so->so_proto;
1968 struct mbuf *nextrecord;
1969 int moff, type = 0;
1970 ssize_t orig_resid = uio->uio_resid;
1971
1972 mp = mp0;
1973 if (psa != NULL)
1974 *psa = NULL;
1975 if (controlp != NULL)
1976 *controlp = NULL;
1977 if (flagsp != NULL)
1978 flags = *flagsp &~ MSG_EOR;
1979 else
1980 flags = 0;
1981 if (flags & MSG_OOB)
1982 return (soreceive_rcvoob(so, uio, flags));
1983 if (mp != NULL)
1984 *mp = NULL;
1985 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1986 && uio->uio_resid) {
1987 VNET_SO_ASSERT(so);
1988 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1989 }
1990
1991 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
1992 if (error)
1993 return (error);
1994
1995restart:
1996 SOCKBUF_LOCK(&so->so_rcv);
1997 m = so->so_rcv.sb_mb;
1998 /*
1999 * If we have less data than requested, block awaiting more (subject
2000 * to any timeout) if:
2001 * 1. the current count is less than the low water mark, or
2002 * 2. MSG_DONTWAIT is not set
2003 */
2004 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
2005 sbavail(&so->so_rcv) < uio->uio_resid) &&
2006 sbavail(&so->so_rcv) < so->so_rcv.sb_lowat &&
2007 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
2008 KASSERT(m != NULL || !sbavail(&so->so_rcv),
2009 ("receive: m == %p sbavail == %u",
2010 m, sbavail(&so->so_rcv)));
2011 if (so->so_error || so->so_rerror) {
2012 if (m != NULL)
2013 goto dontblock;
2014 if (so->so_error)
2015 error = so->so_error;
2016 else
2017 error = so->so_rerror;
2018 if ((flags & MSG_PEEK) == 0) {
2019 if (so->so_error)
2020 so->so_error = 0;
2021 else
2022 so->so_rerror = 0;
2023 }
2024 SOCKBUF_UNLOCK(&so->so_rcv);
2025 goto release;
2026 }
2027 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2028 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2029 if (m != NULL)
2030 goto dontblock;
2031#ifdef KERN_TLS
2032 else if (so->so_rcv.sb_tlsdcc == 0 &&
2033 so->so_rcv.sb_tlscc == 0) {
2034#else
2035 else {
2036#endif
2037 SOCKBUF_UNLOCK(&so->so_rcv);
2038 goto release;
2039 }
2040 }
2041 for (; m != NULL; m = m->m_next)
2042 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
2043 m = so->so_rcv.sb_mb;
2044 goto dontblock;
2045 }
2046 if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED |
2047 SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 &&
2048 (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2049 SOCKBUF_UNLOCK(&so->so_rcv);
2050 error = ENOTCONN;
2051 goto release;
2052 }
2053 if (uio->uio_resid == 0) {
2054 SOCKBUF_UNLOCK(&so->so_rcv);
2055 goto release;
2056 }
2057 if ((so->so_state & SS_NBIO) ||
2058 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2059 SOCKBUF_UNLOCK(&so->so_rcv);
2060 error = EWOULDBLOCK;
2061 goto release;
2062 }
2063 SBLASTRECORDCHK(&so->so_rcv);
2064 SBLASTMBUFCHK(&so->so_rcv);
2065 error = sbwait(&so->so_rcv);
2066 SOCKBUF_UNLOCK(&so->so_rcv);
2067 if (error)
2068 goto release;
2069 goto restart;
2070 }
2071dontblock:
2072 /*
2073 * From this point onward, we maintain 'nextrecord' as a cache of the
2074 * pointer to the next record in the socket buffer. We must keep the
2075 * various socket buffer pointers and local stack versions of the
2076 * pointers in sync, pushing out modifications before dropping the
2077 * socket buffer mutex, and re-reading them when picking it up.
2078 *
2079 * Otherwise, we will race with the network stack appending new data
2080 * or records onto the socket buffer by using inconsistent/stale
2081 * versions of the field, possibly resulting in socket buffer
2082 * corruption.
2083 *
2084 * By holding the high-level sblock(), we prevent simultaneous
2085 * readers from pulling off the front of the socket buffer.
2086 */
2087 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2088 if (uio->uio_td)
2089 uio->uio_td->td_ru.ru_msgrcv++;
2090 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
2091 SBLASTRECORDCHK(&so->so_rcv);
2092 SBLASTMBUFCHK(&so->so_rcv);
2093 nextrecord = m->m_nextpkt;
2094 if (pr->pr_flags & PR_ADDR) {
2095 KASSERT(m->m_type == MT_SONAME,
2096 ("m->m_type == %d", m->m_type));
2097 orig_resid = 0;
2098 if (psa != NULL)
2099 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
2100 M_NOWAIT);
2101 if (flags & MSG_PEEK) {
2102 m = m->m_next;
2103 } else {
2104 sbfree(&so->so_rcv, m);
2105 so->so_rcv.sb_mb = m_free(m);
2106 m = so->so_rcv.sb_mb;
2107 sockbuf_pushsync(&so->so_rcv, nextrecord);
2108 }
2109 }
2110
2111 /*
2112 * Process one or more MT_CONTROL mbufs present before any data mbufs
2113 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2114 * just copy the data; if !MSG_PEEK, we call into the protocol to
2115 * perform externalization (or freeing if controlp == NULL).
2116 */
2117 if (m != NULL && m->m_type == MT_CONTROL) {
2118 struct mbuf *cm = NULL, *cmn;
2119 struct mbuf **cme = &cm;
2120#ifdef KERN_TLS
2121 struct cmsghdr *cmsg;
2122 struct tls_get_record tgr;
2123
2124 /*
2125 * For MSG_TLSAPPDATA, check for a non-application data
2126 * record. If found, return ENXIO without removing
2127 * it from the receive queue. This allows a subsequent
2128 * call without MSG_TLSAPPDATA to receive it.
2129 * Note that, for TLS, there should only be a single
2130 * control mbuf with the TLS_GET_RECORD message in it.
2131 */
2132 if (flags & MSG_TLSAPPDATA) {
2133 cmsg = mtod(m, struct cmsghdr *);
2134 if (cmsg->cmsg_type == TLS_GET_RECORD &&
2135 cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) {
2136 memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
2137 /* This will need to change for TLS 1.3. */
2138 if (tgr.tls_type != TLS_RLTYPE_APP) {
2139 SOCKBUF_UNLOCK(&so->so_rcv);
2140 error = ENXIO;
2141 goto release;
2142 }
2143 }
2144 }
2145#endif
2146
2147 do {
2148 if (flags & MSG_PEEK) {
2149 if (controlp != NULL) {
2150 *controlp = m_copym(m, 0, m->m_len,
2151 M_NOWAIT);
2152 controlp = &(*controlp)->m_next;
2153 }
2154 m = m->m_next;
2155 } else {
2156 sbfree(&so->so_rcv, m);
2157 so->so_rcv.sb_mb = m->m_next;
2158 m->m_next = NULL;
2159 *cme = m;
2160 cme = &(*cme)->m_next;
2161 m = so->so_rcv.sb_mb;
2162 }
2163 } while (m != NULL && m->m_type == MT_CONTROL);
2164 if ((flags & MSG_PEEK) == 0)
2165 sockbuf_pushsync(&so->so_rcv, nextrecord);
2166 while (cm != NULL) {
2167 cmn = cm->m_next;
2168 cm->m_next = NULL;
2169 if (pr->pr_domain->dom_externalize != NULL) {
2170 SOCKBUF_UNLOCK(&so->so_rcv);
2171 VNET_SO_ASSERT(so);
2172 error = (*pr->pr_domain->dom_externalize)
2173 (cm, controlp, flags);
2174 SOCKBUF_LOCK(&so->so_rcv);
2175 } else if (controlp != NULL)
2176 *controlp = cm;
2177 else
2178 m_freem(cm);
2179 if (controlp != NULL) {
2180 while (*controlp != NULL)
2181 controlp = &(*controlp)->m_next;
2182 }
2183 cm = cmn;
2184 }
2185 if (m != NULL)
2186 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
2187 else
2188 nextrecord = so->so_rcv.sb_mb;
2189 orig_resid = 0;
2190 }
2191 if (m != NULL) {
2192 if ((flags & MSG_PEEK) == 0) {
2193 KASSERT(m->m_nextpkt == nextrecord,
2194 ("soreceive: post-control, nextrecord !sync"));
2195 if (nextrecord == NULL) {
2196 KASSERT(so->so_rcv.sb_mb == m,
2197 ("soreceive: post-control, sb_mb!=m"));
2198 KASSERT(so->so_rcv.sb_lastrecord == m,
2199 ("soreceive: post-control, lastrecord!=m"));
2200 }
2201 }
2202 type = m->m_type;
2203 if (type == MT_OOBDATA)
2204 flags |= MSG_OOB;
2205 } else {
2206 if ((flags & MSG_PEEK) == 0) {
2207 KASSERT(so->so_rcv.sb_mb == nextrecord,
2208 ("soreceive: sb_mb != nextrecord"));
2209 if (so->so_rcv.sb_mb == NULL) {
2210 KASSERT(so->so_rcv.sb_lastrecord == NULL,
2211 ("soreceive: sb_lastercord != NULL"));
2212 }
2213 }
2214 }
2215 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2216 SBLASTRECORDCHK(&so->so_rcv);
2217 SBLASTMBUFCHK(&so->so_rcv);
2218
2219 /*
2220 * Now continue to read any data mbufs off of the head of the socket
2221 * buffer until the read request is satisfied. Note that 'type' is
2222 * used to store the type of any mbuf reads that have happened so far
2223 * such that soreceive() can stop reading if the type changes, which
2224 * causes soreceive() to return only one of regular data and inline
2225 * out-of-band data in a single socket receive operation.
2226 */
2227 moff = 0;
2228 offset = 0;
2229 while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0
2230 && error == 0) {
2231 /*
2232 * If the type of mbuf has changed since the last mbuf
2233 * examined ('type'), end the receive operation.
2234 */
2235 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2236 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
2237 if (type != m->m_type)
2238 break;
2239 } else if (type == MT_OOBDATA)
2240 break;
2241 else
2242 KASSERT(m->m_type == MT_DATA,
2243 ("m->m_type == %d", m->m_type));
2244 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
2245 len = uio->uio_resid;
2246 if (so->so_oobmark && len > so->so_oobmark - offset)
2247 len = so->so_oobmark - offset;
2248 if (len > m->m_len - moff)
2249 len = m->m_len - moff;
2250 /*
2251 * If mp is set, just pass back the mbufs. Otherwise copy
2252 * them out via the uio, then free. Sockbuf must be
 2253 * consistent here (sb_mb points to the current mbuf, m_nextpkt
 2254 * to the next record) when we drop the lock; we must note any
 2255 * additions to the sockbuf when we reacquire it.
2256 */
2257 if (mp == NULL) {
2258 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2259 SBLASTRECORDCHK(&so->so_rcv);
2260 SBLASTMBUFCHK(&so->so_rcv);
2261 SOCKBUF_UNLOCK(&so->so_rcv);
2262 if ((m->m_flags & M_EXTPG) != 0)
2263 error = m_unmapped_uiomove(m, moff, uio,
2264 (int)len);
2265 else
2266 error = uiomove(mtod(m, char *) + moff,
2267 (int)len, uio);
2268 SOCKBUF_LOCK(&so->so_rcv);
2269 if (error) {
2270 /*
2271 * The MT_SONAME mbuf has already been removed
2272 * from the record, so it is necessary to
2273 * remove the data mbufs, if any, to preserve
2274 * the invariant in the case of PR_ADDR that
2275 * requires MT_SONAME mbufs at the head of
2276 * each record.
2277 */
2278 if (pr->pr_flags & PR_ATOMIC &&
2279 ((flags & MSG_PEEK) == 0))
2280 (void)sbdroprecord_locked(&so->so_rcv);
2281 SOCKBUF_UNLOCK(&so->so_rcv);
2282 goto release;
2283 }
2284 } else
2285 uio->uio_resid -= len;
2286 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2287 if (len == m->m_len - moff) {
2288 if (m->m_flags & M_EOR)
2289 flags |= MSG_EOR;
2290 if (flags & MSG_PEEK) {
2291 m = m->m_next;
2292 moff = 0;
2293 } else {
2294 nextrecord = m->m_nextpkt;
2295 sbfree(&so->so_rcv, m);
2296 if (mp != NULL) {
2297 m->m_nextpkt = NULL;
2298 *mp = m;
2299 mp = &m->m_next;
2300 so->so_rcv.sb_mb = m = m->m_next;
2301 *mp = NULL;
2302 } else {
2303 so->so_rcv.sb_mb = m_free(m);
2304 m = so->so_rcv.sb_mb;
2305 }
2306 sockbuf_pushsync(&so->so_rcv, nextrecord);
2307 SBLASTRECORDCHK(&so->so_rcv);
2308 SBLASTMBUFCHK(&so->so_rcv);
2309 }
2310 } else {
2311 if (flags & MSG_PEEK)
2312 moff += len;
2313 else {
2314 if (mp != NULL) {
2315 if (flags & MSG_DONTWAIT) {
2316 *mp = m_copym(m, 0, len,
2317 M_NOWAIT);
2318 if (*mp == NULL) {
2319 /*
2320 * m_copym() couldn't
2321 * allocate an mbuf.
2322 * Adjust uio_resid back
2323 * (it was adjusted
2324 * down by len bytes,
2325 * which we didn't end
2326 * up "copying" over).
2327 */
2328 uio->uio_resid += len;
2329 break;
2330 }
2331 } else {
2332 SOCKBUF_UNLOCK(&so->so_rcv);
2333 *mp = m_copym(m, 0, len,
2334 M_WAITOK);
2335 SOCKBUF_LOCK(&so->so_rcv);
2336 }
2337 }
2338 sbcut_locked(&so->so_rcv, len);
2339 }
2340 }
2341 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2342 if (so->so_oobmark) {
2343 if ((flags & MSG_PEEK) == 0) {
2344 so->so_oobmark -= len;
2345 if (so->so_oobmark == 0) {
2346 so->so_rcv.sb_state |= SBS_RCVATMARK;
2347 break;
2348 }
2349 } else {
2350 offset += len;
2351 if (offset == so->so_oobmark)
2352 break;
2353 }
2354 }
2355 if (flags & MSG_EOR)
2356 break;
2357 /*
 2358 * If the MSG_WAITALL flag is set (for a non-atomic socket), we
2359 * must not quit until "uio->uio_resid == 0" or an error
2360 * termination. If a signal/timeout occurs, return with a
2361 * short count but without error. Keep sockbuf locked
2362 * against other readers.
2363 */
2364 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
2365 !sosendallatonce(so) && nextrecord == NULL) {
2366 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2367 if (so->so_error || so->so_rerror ||
2368 so->so_rcv.sb_state & SBS_CANTRCVMORE)
2369 break;
2370 /*
2371 * Notify the protocol that some data has been
2372 * drained before blocking.
2373 */
2374 if (pr->pr_flags & PR_WANTRCVD) {
2375 SOCKBUF_UNLOCK(&so->so_rcv);
2376 VNET_SO_ASSERT(so);
2377 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2378 SOCKBUF_LOCK(&so->so_rcv);
2379 }
2380 SBLASTRECORDCHK(&so->so_rcv);
2381 SBLASTMBUFCHK(&so->so_rcv);
2382 /*
 2383 * We could have received some data while we were notifying
2384 * the protocol. Skip blocking in this case.
2385 */
2386 if (so->so_rcv.sb_mb == NULL) {
2387 error = sbwait(&so->so_rcv);
2388 if (error) {
2389 SOCKBUF_UNLOCK(&so->so_rcv);
2390 goto release;
2391 }
2392 }
2393 m = so->so_rcv.sb_mb;
2394 if (m != NULL)
2395 nextrecord = m->m_nextpkt;
2396 }
2397 }
2398
2399 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2400 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
2401 flags |= MSG_TRUNC;
2402 if ((flags & MSG_PEEK) == 0)
2403 (void) sbdroprecord_locked(&so->so_rcv);
2404 }
2405 if ((flags & MSG_PEEK) == 0) {
2406 if (m == NULL) {
2407 /*
2408 * First part is an inline SB_EMPTY_FIXUP(). Second
2409 * part makes sure sb_lastrecord is up-to-date if
2410 * there is still data in the socket buffer.
2411 */
2412 so->so_rcv.sb_mb = nextrecord;
2413 if (so->so_rcv.sb_mb == NULL) {
2414 so->so_rcv.sb_mbtail = NULL;
2415 so->so_rcv.sb_lastrecord = NULL;
2416 } else if (nextrecord->m_nextpkt == NULL)
2417 so->so_rcv.sb_lastrecord = nextrecord;
2418 }
2419 SBLASTRECORDCHK(&so->so_rcv);
2420 SBLASTMBUFCHK(&so->so_rcv);
2421 /*
2422 * If soreceive() is being done from the socket callback,
 2423 * then we don't need to generate an ACK to the peer to update
 2424 * the window, since the ACK will be generated on return to TCP.
2425 */
2426 if (!(flags & MSG_SOCALLBCK) &&
2427 (pr->pr_flags & PR_WANTRCVD)) {
2428 SOCKBUF_UNLOCK(&so->so_rcv);
2429 VNET_SO_ASSERT(so);
2430 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2431 SOCKBUF_LOCK(&so->so_rcv);
2432 }
2433 }
2434 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2435 if (orig_resid == uio->uio_resid && orig_resid &&
2436 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
2437 SOCKBUF_UNLOCK(&so->so_rcv);
2438 goto restart;
2439 }
2440 SOCKBUF_UNLOCK(&so->so_rcv);
2441
2442 if (flagsp != NULL)
2443 *flagsp |= flags;
2444release:
2445 SOCK_IO_RECV_UNLOCK(so);
2446 return (error);
2447}
2448
2449/*
2450 * Optimized version of soreceive() for stream (TCP) sockets.
2451 */
2452int
2453soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
2454 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2455{
2456 int len = 0, error = 0, flags, oresid;
2457 struct sockbuf *sb;
2458 struct mbuf *m, *n = NULL;
2459
2460 /* We only do stream sockets. */
2461 if (so->so_type != SOCK_STREAM)
2462 return (EINVAL);
2463 if (psa != NULL)
2464 *psa = NULL;
2465 if (flagsp != NULL)
2466 flags = *flagsp &~ MSG_EOR;
2467 else
2468 flags = 0;
2469 if (controlp != NULL)
2470 *controlp = NULL;
2471 if (flags & MSG_OOB)
2472 return (soreceive_rcvoob(so, uio, flags));
2473 if (mp0 != NULL)
2474 *mp0 = NULL;
2475
2476 sb = &so->so_rcv;
2477
2478#ifdef KERN_TLS
2479 /*
 2480 * KTLS stores TLS records as records with a control message to
2481 * describe the framing.
2482 *
2483 * We check once here before acquiring locks to optimize the
2484 * common case.
2485 */
2486 if (sb->sb_tls_info != NULL)
2487 return (soreceive_generic(so, psa, uio, mp0, controlp,
2488 flagsp));
2489#endif
2490
2491 /* Prevent other readers from entering the socket. */
2492 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
2493 if (error)
2494 return (error);
2495 SOCKBUF_LOCK(sb);
2496
2497#ifdef KERN_TLS
2498 if (sb->sb_tls_info != NULL) {
2499 SOCKBUF_UNLOCK(sb);
2500 SOCK_IO_RECV_UNLOCK(so);
2501 return (soreceive_generic(so, psa, uio, mp0, controlp,
2502 flagsp));
2503 }
2504#endif
2505
2506 /* Easy one, no space to copyout anything. */
2507 if (uio->uio_resid == 0) {
2508 error = EINVAL;
2509 goto out;
2510 }
2511 oresid = uio->uio_resid;
2512
2513 /* We will never ever get anything unless we are or were connected. */
2514 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
2515 error = ENOTCONN;
2516 goto out;
2517 }
2518
2519restart:
2520 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2521
2522 /* Abort if socket has reported problems. */
2523 if (so->so_error) {
2524 if (sbavail(sb) > 0)
2525 goto deliver;
2526 if (oresid > uio->uio_resid)
2527 goto out;
2528 error = so->so_error;
2529 if (!(flags & MSG_PEEK))
2530 so->so_error = 0;
2531 goto out;
2532 }
2533
2534 /* Door is closed. Deliver what is left, if any. */
2535 if (sb->sb_state & SBS_CANTRCVMORE) {
2536 if (sbavail(sb) > 0)
2537 goto deliver;
2538 else
2539 goto out;
2540 }
2541
2542 /* Socket buffer is empty and we shall not block. */
2543 if (sbavail(sb) == 0 &&
2544 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
2545 error = EAGAIN;
2546 goto out;
2547 }
2548
2549 /* Socket buffer got some data that we shall deliver now. */
2550 if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
2551 ((so->so_state & SS_NBIO) ||
2552 (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
2553 sbavail(sb) >= sb->sb_lowat ||
2554 sbavail(sb) >= uio->uio_resid ||
2555 sbavail(sb) >= sb->sb_hiwat) ) {
2556 goto deliver;
2557 }
2558
2559 /* On MSG_WAITALL we must wait until all data or error arrives. */
2560 if ((flags & MSG_WAITALL) &&
2561 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat))
2562 goto deliver;
2563
2564 /*
2565 * Wait and block until (more) data comes in.
2566 * NB: Drops the sockbuf lock during wait.
2567 */
2568 error = sbwait(sb);
2569 if (error)
2570 goto out;
2571 goto restart;
2572
2573deliver:
2574 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2575 KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
2576 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
2577
2578 /* Statistics. */
2579 if (uio->uio_td)
2580 uio->uio_td->td_ru.ru_msgrcv++;
2581
2582 /* Fill uio until full or current end of socket buffer is reached. */
2583 len = min(uio->uio_resid, sbavail(sb));
2584 if (mp0 != NULL) {
2585 /* Dequeue as many mbufs as possible. */
2586 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
2587 if (*mp0 == NULL)
2588 *mp0 = sb->sb_mb;
2589 else
2590 m_cat(*mp0, sb->sb_mb);
2591 for (m = sb->sb_mb;
2592 m != NULL && m->m_len <= len;
2593 m = m->m_next) {
2594 KASSERT(!(m->m_flags & M_NOTAVAIL),
2595 ("%s: m %p not available", __func__, m));
2596 len -= m->m_len;
2597 uio->uio_resid -= m->m_len;
2598 sbfree(sb, m);
2599 n = m;
2600 }
2601 n->m_next = NULL;
2602 sb->sb_mb = m;
2603 sb->sb_lastrecord = sb->sb_mb;
2604 if (sb->sb_mb == NULL)
2605 SB_EMPTY_FIXUP(sb);
2606 }
2607 /* Copy the remainder. */
2608 if (len > 0) {
2609 KASSERT(sb->sb_mb != NULL,
2610 ("%s: len > 0 && sb->sb_mb empty", __func__));
2611
2612 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
2613 if (m == NULL)
2614 len = 0; /* Don't flush data from sockbuf. */
2615 else
2616 uio->uio_resid -= len;
2617 if (*mp0 != NULL)
2618 m_cat(*mp0, m);
2619 else
2620 *mp0 = m;
2621 if (*mp0 == NULL) {
2622 error = ENOBUFS;
2623 goto out;
2624 }
2625 }
2626 } else {
2627 /* NB: Must unlock socket buffer as uiomove may sleep. */
2628 SOCKBUF_UNLOCK(sb);
2629 error = m_mbuftouio(uio, sb->sb_mb, len);
2630 SOCKBUF_LOCK(sb);
2631 if (error)
2632 goto out;
2633 }
2634 SBLASTRECORDCHK(sb);
2635 SBLASTMBUFCHK(sb);
2636
2637 /*
2638 * Remove the delivered data from the socket buffer unless we
2639 * were only peeking.
2640 */
2641 if (!(flags & MSG_PEEK)) {
2642 if (len > 0)
2643 sbdrop_locked(sb, len);
2644
2645 /* Notify protocol that we drained some data. */
2646 if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2647 (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2648 !(flags & MSG_SOCALLBCK))) {
2649 SOCKBUF_UNLOCK(sb);
2650 VNET_SO_ASSERT(so);
2651 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2652 SOCKBUF_LOCK(sb);
2653 }
2654 }
2655
2656 /*
2657 * For MSG_WAITALL we may have to loop again and wait for
2658 * more data to come in.
2659 */
2660 if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2661 goto restart;
2662out:
2663 SBLASTRECORDCHK(sb);
2664 SBLASTMBUFCHK(sb);
2665 SOCKBUF_UNLOCK(sb);
2666 SOCK_IO_RECV_UNLOCK(so);
2667 return (error);
2668}
2669
2670/*
2671 * Optimized version of soreceive() for simple datagram cases from userspace.
2672 * Unlike in the stream case, we're able to drop a datagram if copyout()
2673 * fails, and because we handle datagrams atomically, we don't need to use a
2674 * sleep lock to prevent I/O interlacing.
2675 */
2676int
2677soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2678 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2679{
2680 struct mbuf *m, *m2;
2681 int flags, error;
2682 ssize_t len;
2683 struct protosw *pr = so->so_proto;
2684 struct mbuf *nextrecord;
2685
2686 if (psa != NULL)
2687 *psa = NULL;
2688 if (controlp != NULL)
2689 *controlp = NULL;
2690 if (flagsp != NULL)
2691 flags = *flagsp &~ MSG_EOR;
2692 else
2693 flags = 0;
2694
2695 /*
2696 * For any complicated cases, fall back to the full
2697 * soreceive_generic().
2698 */
2699 if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2700 return (soreceive_generic(so, psa, uio, mp0, controlp,
2701 flagsp));
2702
2703 /*
2704 * Enforce restrictions on use.
2705 */
2706 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2707 ("soreceive_dgram: wantrcvd"));
2708 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2709 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2710 ("soreceive_dgram: SBS_RCVATMARK"));
2711 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2712 ("soreceive_dgram: P_CONNREQUIRED"));
2713
2714 /*
2715 * Loop blocking while waiting for a datagram.
2716 */
2717 SOCKBUF_LOCK(&so->so_rcv);
2718 while ((m = so->so_rcv.sb_mb) == NULL) {
2719 KASSERT(sbavail(&so->so_rcv) == 0,
2720 ("soreceive_dgram: sb_mb NULL but sbavail %u",
2721 sbavail(&so->so_rcv)));
2722 if (so->so_error) {
2723 error = so->so_error;
2724 so->so_error = 0;
2725 SOCKBUF_UNLOCK(&so->so_rcv);
2726 return (error);
2727 }
2728 if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2729 uio->uio_resid == 0) {
2730 SOCKBUF_UNLOCK(&so->so_rcv);
2731 return (0);
2732 }
2733 if ((so->so_state & SS_NBIO) ||
2734 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2735 SOCKBUF_UNLOCK(&so->so_rcv);
2736 return (EWOULDBLOCK);
2737 }
2738 SBLASTRECORDCHK(&so->so_rcv);
2739 SBLASTMBUFCHK(&so->so_rcv);
2740 error = sbwait(&so->so_rcv);
2741 if (error) {
2742 SOCKBUF_UNLOCK(&so->so_rcv);
2743 return (error);
2744 }
2745 }
2746 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2747
2748 if (uio->uio_td)
2749 uio->uio_td->td_ru.ru_msgrcv++;
2750 SBLASTRECORDCHK(&so->so_rcv);
2751 SBLASTMBUFCHK(&so->so_rcv);
2752 nextrecord = m->m_nextpkt;
2753 if (nextrecord == NULL) {
2754 KASSERT(so->so_rcv.sb_lastrecord == m,
2755 ("soreceive_dgram: lastrecord != m"));
2756 }
2757
2758 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2759 ("soreceive_dgram: m_nextpkt != nextrecord"));
2760
2761 /*
2762 * Pull 'm' and its chain off the front of the packet queue.
2763 */
2764 so->so_rcv.sb_mb = NULL;
2765 sockbuf_pushsync(&so->so_rcv, nextrecord);
2766
2767 /*
2768 * Walk 'm's chain and free that many bytes from the socket buffer.
2769 */
2770 for (m2 = m; m2 != NULL; m2 = m2->m_next)
2771 sbfree(&so->so_rcv, m2);
2772
2773 /*
2774 * Do a few last checks before we let go of the lock.
2775 */
2776 SBLASTRECORDCHK(&so->so_rcv);
2777 SBLASTMBUFCHK(&so->so_rcv);
2778 SOCKBUF_UNLOCK(&so->so_rcv);
2779
2780 if (pr->pr_flags & PR_ADDR) {
2781 KASSERT(m->m_type == MT_SONAME,
2782 ("m->m_type == %d", m->m_type));
2783 if (psa != NULL)
2784 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
2785 M_NOWAIT);
2786 m = m_free(m);
2787 }
2788 if (m == NULL) {
2789 /* XXXRW: Can this happen? */
2790 return (0);
2791 }
2792
2793 /*
2794 * Packet to copyout() is now in 'm' and it is disconnected from the
2795 * queue.
2796 *
2797 * Process one or more MT_CONTROL mbufs present before any data mbufs
2798 * in the first mbuf chain on the socket buffer. We call into the
2799 * protocol to perform externalization (or freeing if controlp ==
2800 * NULL). In some cases there can be only MT_CONTROL mbufs without
2801 * MT_DATA mbufs.
2802 */
2803 if (m->m_type == MT_CONTROL) {
2804 struct mbuf *cm = NULL, *cmn;
2805 struct mbuf **cme = &cm;
2806
2807 do {
2808 m2 = m->m_next;
2809 m->m_next = NULL;
2810 *cme = m;
2811 cme = &(*cme)->m_next;
2812 m = m2;
2813 } while (m != NULL && m->m_type == MT_CONTROL);
2814 while (cm != NULL) {
2815 cmn = cm->m_next;
2816 cm->m_next = NULL;
2817 if (pr->pr_domain->dom_externalize != NULL) {
2818 error = (*pr->pr_domain->dom_externalize)
2819 (cm, controlp, flags);
2820 } else if (controlp != NULL)
2821 *controlp = cm;
2822 else
2823 m_freem(cm);
2824 if (controlp != NULL) {
2825 while (*controlp != NULL)
2826 controlp = &(*controlp)->m_next;
2827 }
2828 cm = cmn;
2829 }
2830 }
2831 KASSERT(m == NULL || m->m_type == MT_DATA,
2832 ("soreceive_dgram: !data"));
2833 while (m != NULL && uio->uio_resid > 0) {
2834 len = uio->uio_resid;
2835 if (len > m->m_len)
2836 len = m->m_len;
2837 error = uiomove(mtod(m, char *), (int)len, uio);
2838 if (error) {
2839 m_freem(m);
2840 return (error);
2841 }
2842 if (len == m->m_len)
2843 m = m_free(m);
2844 else {
2845 m->m_data += len;
2846 m->m_len -= len;
2847 }
2848 }
2849 if (m != NULL) {
2850 flags |= MSG_TRUNC;
2851 m_freem(m);
2852 }
2853 if (flagsp != NULL)
2854 *flagsp |= flags;
2855 return (0);
2856}
2857
2858int
2859soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2860 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2861{
2862 int error;
2863
2864 CURVNET_SET(so->so_vnet);
2865 error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio,
2866 mp0, controlp, flagsp));
2867 CURVNET_RESTORE();
2868 return (error);
2869}
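/*
 * Illustrative sketch, not part of uipc_socket.c: a kernel-side read of
 * up to 'len' bytes into a system-space buffer via soreceive().  On
 * return, auio.uio_resid says how much was left unread and 'flags'
 * carries MSG_* results such as MSG_TRUNC or MSG_EOR.  All names here
 * are hypothetical.
 */
static int
example_soreceive(struct socket *so, void *buf, size_t len, struct thread *td)
{
	struct uio auio;
	struct iovec aiov;
	int flags = 0;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	/* No source address, mbuf chain, or control data requested. */
	return (soreceive(so, NULL, &auio, NULL, NULL, &flags));
}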
2870
2871int
2872soshutdown(struct socket *so, int how)
2873{
2874 struct protosw *pr;
2875 int error, soerror_enotconn;
2876
2877 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2878 return (EINVAL);
2879
2880 soerror_enotconn = 0;
2881 SOCK_LOCK(so);
2882 if ((so->so_state &
2883 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
2884 /*
 2885 * POSIX mandates that we return ENOTCONN when shutdown(2) is
 2886 * invoked on a datagram socket; however, historically we would
 2887 * actually tear the socket down.  Some applications leverage this
 2888 * to let one process unblock another that is waiting in recvXXX(2)
 2889 * on a shared socket.  Try to meet both backward-compatibility and
 2890 * POSIX requirements by forcing ENOTCONN but still asking the
 2891 * protocol to perform pru_shutdown().
2892 */
2893 if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) {
2894 SOCK_UNLOCK(so);
2895 return (ENOTCONN);
2896 }
2897 soerror_enotconn = 1;
2898 }
2899
2900 if (SOLISTENING(so)) {
2901 if (how != SHUT_WR) {
2902 so->so_error = ECONNABORTED;
2903 solisten_wakeup(so); /* unlocks so */
2904 } else {
2905 SOCK_UNLOCK(so);
2906 }
2907 goto done;
2908 }
2909 SOCK_UNLOCK(so);
2910
2911 CURVNET_SET(so->so_vnet);
2912 pr = so->so_proto;
2913 if (pr->pr_usrreqs->pru_flush != NULL)
2914 (*pr->pr_usrreqs->pru_flush)(so, how);
2915 if (how != SHUT_WR)
2916 sorflush(so);
2917 if (how != SHUT_RD) {
2918 error = (*pr->pr_usrreqs->pru_shutdown)(so);
2919 wakeup(&so->so_timeo);
2920 CURVNET_RESTORE();
2921 return ((error == 0 && soerror_enotconn) ? ENOTCONN : error);
2922 }
2923 wakeup(&so->so_timeo);
2924 CURVNET_RESTORE();
2925
2926done:
2927 return (soerror_enotconn ? ENOTCONN : 0);
2928}
2929
2930void
2931sorflush(struct socket *so)
2932{
2933 struct socket aso;
2934 struct protosw *pr;
2935 int error;
2936
2937 VNET_SO_ASSERT(so);
2938
2939 /*
2940 * In order to avoid calling dom_dispose with the socket buffer mutex
2941 * held, we make a partial copy of the socket buffer and clear the
2942 * original. The new socket buffer copy won't have initialized locks so
2943 * we can only call routines that won't use or assert those locks.
2944 * Ideally calling socantrcvmore() would prevent data from being added
2945 * to the buffer, but currently it merely prevents buffered data from
2946 * being read by userspace. We make this effort to free buffered data
2947 * nonetheless.
2948 *
2949 * Dislodge threads currently blocked in receive and wait to acquire
2950 * a lock against other simultaneous readers before clearing the
2951 * socket buffer. Don't let our acquire be interrupted by a signal
 2952 * despite any existing socket disposition on interruptible waiting.
2953 */
2954 socantrcvmore(so);
2955
2956 error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR);
2957 if (error != 0) {
2958 KASSERT(SOLISTENING(so),
2959 ("%s: soiolock(%p) failed", __func__, so));
2960 return;
2961 }
2962
2963 SOCK_RECVBUF_LOCK(so);
2964 bzero(&aso, sizeof(aso));
2965 aso.so_pcb = so->so_pcb;
2966 bcopy(&so->so_rcv.sb_startzero, &aso.so_rcv.sb_startzero,
2967 offsetof(struct sockbuf, sb_endzero) -
2968 offsetof(struct sockbuf, sb_startzero));
2969 bzero(&so->so_rcv.sb_startzero,
2970 offsetof(struct sockbuf, sb_endzero) -
2971 offsetof(struct sockbuf, sb_startzero));
2972 SOCK_RECVBUF_UNLOCK(so);
2973 SOCK_IO_RECV_UNLOCK(so);
2974
2975 /*
2976 * Dispose of special rights and flush the copied socket. Don't call
2977 * any unsafe routines (that rely on locks being initialized) on aso.
2978 */
2979 pr = so->so_proto;
2980 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2981 (*pr->pr_domain->dom_dispose)(&aso);
2982 sbrelease_internal(&aso.so_rcv, so);
2983}
2984
2985/*
2986 * Wrapper for Socket established helper hook.
2987 * Parameters: socket, context of the hook point, hook id.
2988 */
2989static int inline
2990hhook_run_socket(struct socket *so, void *hctx, int32_t h_id)
2991{
2992 struct socket_hhook_data hhook_data = {
2993 .so = so,
2994 .hctx = hctx,
2995 .m = NULL,
2996 .status = 0
2997 };
2998
2999 CURVNET_SET(so->so_vnet);
3000 HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd);
3001 CURVNET_RESTORE();
3002
3003 /* Ugly but needed, since hhooks return void for now */
3004 return (hhook_data.status);
3005}
3006
3007/*
3008 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
3009 * additional variant to handle the case where the option value needs to be
3010 * some kind of integer, but not a specific size. In addition to their use
3011 * here, these functions are also called by the protocol-level pr_ctloutput()
3012 * routines.
3013 */
3014int
3015sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
3016{
3017 size_t valsize;
3018
3019 /*
3020 * If the user gives us more than we wanted, we ignore it, but if we
3021 * don't get the minimum length the caller wants, we return EINVAL.
3022 * On success, sopt->sopt_valsize is set to however much we actually
3023 * retrieved.
3024 */
3025 if ((valsize = sopt->sopt_valsize) < minlen)
3026 return EINVAL;
3027 if (valsize > len)
3028 sopt->sopt_valsize = valsize = len;
3029
3030 if (sopt->sopt_td != NULL)
3031 return (copyin(sopt->sopt_val, buf, valsize));
3032
3033 bcopy(sopt->sopt_val, buf, valsize);
3034 return (0);
3035}
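/*
 * Illustrative sketch, not part of uipc_socket.c: how a protocol's
 * pr_ctloutput() handler typically uses sooptcopyin() for an int-sized
 * SOPT_SET option.  The handler name and option semantics are
 * hypothetical.
 */
static int
example_ctloutput_set(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	/* Require exactly an int's worth of option data. */
	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error)
		return (error);
	if (optval < 0)
		return (EINVAL);
	/* ... apply optval to protocol-private state for 'so' here ... */
	return (0);
}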
3036
3037/*
3038 * Kernel version of setsockopt(2).
3039 *
3040 * XXX: optlen is size_t, not socklen_t
3041 */
3042int
3043so_setsockopt(struct socket *so, int level, int optname, void *optval,
3044 size_t optlen)
3045{
3046 struct sockopt sopt;
3047
3048 sopt.sopt_level = level;
3049 sopt.sopt_name = optname;
3050 sopt.sopt_dir = SOPT_SET;
3051 sopt.sopt_val = optval;
3052 sopt.sopt_valsize = optlen;
3053 sopt.sopt_td = NULL;
3054 return (sosetopt(so, &sopt));
3055}
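/*
 * Illustrative sketch, not part of uipc_socket.c: enabling SO_KEEPALIVE
 * on a kernel-owned socket through the kernel setsockopt wrapper above.
 * The function name is hypothetical.
 */
static int
example_enable_keepalive(struct socket *so)
{
	int one = 1;

	return (so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one,
	    sizeof(one)));
}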
3056
3057int
3058sosetopt(struct socket *so, struct sockopt *sopt)
3059{
3060 int error, optval;
3061 struct linger l;
3062 struct timeval tv;
3063 sbintime_t val;
3064 uint32_t val32;
3065#ifdef MAC
3066 struct mac extmac;
3067#endif
3068
3069 CURVNET_SET(so->so_vnet);
3070 error = 0;
3071 if (sopt->sopt_level != SOL_SOCKET) {
3072 if (so->so_proto->pr_ctloutput != NULL)
3073 error = (*so->so_proto->pr_ctloutput)(so, sopt);
3074 else
3075 error = ENOPROTOOPT;
3076 } else {
3077 switch (sopt->sopt_name) {
3078 case SO_ACCEPTFILTER:
3079 error = accept_filt_setopt(so, sopt);
3080 if (error)
3081 goto bad;
3082 break;
3083
3084 case SO_LINGER:
3085 error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
3086 if (error)
3087 goto bad;
3088 if (l.l_linger < 0 ||
3089 l.l_linger > USHRT_MAX ||
3090 l.l_linger > (INT_MAX / hz)) {
3091 error = EDOM;
3092 goto bad;
3093 }
3094 SOCK_LOCK(so);
3095 so->so_linger = l.l_linger;
3096 if (l.l_onoff)
3097 so->so_options |= SO_LINGER;
3098 else
3099 so->so_options &= ~SO_LINGER;
3100 SOCK_UNLOCK(so);
3101 break;
3102
3103 case SO_DEBUG:
3104 case SO_KEEPALIVE:
3105 case SO_DONTROUTE:
3106 case SO_USELOOPBACK:
3107 case SO_BROADCAST:
3108 case SO_REUSEADDR:
3109 case SO_REUSEPORT:
3110 case SO_REUSEPORT_LB:
3111 case SO_OOBINLINE:
3112 case SO_TIMESTAMP:
3113 case SO_BINTIME:
3114 case SO_NOSIGPIPE:
3115 case SO_NO_DDP:
3116 case SO_NO_OFFLOAD:
3117 case SO_RERROR:
3118 error = sooptcopyin(sopt, &optval, sizeof optval,
3119 sizeof optval);
3120 if (error)
3121 goto bad;
3122 SOCK_LOCK(so);
3123 if (optval)
3124 so->so_options |= sopt->sopt_name;
3125 else
3126 so->so_options &= ~sopt->sopt_name;
3127 SOCK_UNLOCK(so);
3128 break;
3129
3130 case SO_SETFIB:
3131 error = sooptcopyin(sopt, &optval, sizeof optval,
3132 sizeof optval);
3133 if (error)
3134 goto bad;
3135
3136 if (optval < 0 || optval >= rt_numfibs) {
3137 error = EINVAL;
3138 goto bad;
3139 }
3140 if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
3141 (so->so_proto->pr_domain->dom_family == PF_INET6) ||
3142 (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
3143 so->so_fibnum = optval;
3144 else
3145 so->so_fibnum = 0;
3146 break;
3147
3148 case SO_USER_COOKIE:
3149 error = sooptcopyin(sopt, &val32, sizeof val32,
3150 sizeof val32);
3151 if (error)
3152 goto bad;
3153 so->so_user_cookie = val32;
3154 break;
3155
3156 case SO_SNDBUF:
3157 case SO_RCVBUF:
3158 case SO_SNDLOWAT:
3159 case SO_RCVLOWAT:
3160 error = sooptcopyin(sopt, &optval, sizeof optval,
3161 sizeof optval);
3162 if (error)
3163 goto bad;
3164
3165 /*
3166 * Values < 1 make no sense for any of these options,
3167 * so disallow them.
3168 */
3169 if (optval < 1) {
3170 error = EINVAL;
3171 goto bad;
3172 }
3173
3174 error = sbsetopt(so, sopt->sopt_name, optval);
3175 break;
3176
3177 case SO_SNDTIMEO:
3178 case SO_RCVTIMEO:
3179#ifdef COMPAT_FREEBSD32
3180 if (SV_CURPROC_FLAG(SV_ILP32)) {
3181 struct timeval32 tv32;
3182
3183 error = sooptcopyin(sopt, &tv32, sizeof tv32,
3184 sizeof tv32);
3185 CP(tv32, tv, tv_sec);
3186 CP(tv32, tv, tv_usec);
3187 } else
3188#endif
3189 error = sooptcopyin(sopt, &tv, sizeof tv,
3190 sizeof tv);
3191 if (error)
3192 goto bad;
3193 if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
3194 tv.tv_usec >= 1000000) {
3195 error = EDOM;
3196 goto bad;
3197 }
3198 if (tv.tv_sec > INT32_MAX)
3199 val = SBT_MAX;
3200 else
3201 val = tvtosbt(tv);
3202 switch (sopt->sopt_name) {
3203 case SO_SNDTIMEO:
3204 so->so_snd.sb_timeo = val;
3205 break;
3206 case SO_RCVTIMEO:
3207 so->so_rcv.sb_timeo = val;
3208 break;
3209 }
3210 break;
3211
3212 case SO_LABEL:
3213#ifdef MAC
3214 error = sooptcopyin(sopt, &extmac, sizeof extmac,
3215 sizeof extmac);
3216 if (error)
3217 goto bad;
3218 error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
3219 so, &extmac);
3220#else
3221 error = EOPNOTSUPP;
3222#endif
3223 break;
3224
3225 case SO_TS_CLOCK:
3226 error = sooptcopyin(sopt, &optval, sizeof optval,
3227 sizeof optval);
3228 if (error)
3229 goto bad;
3230 if (optval < 0 || optval > SO_TS_CLOCK_MAX) {
3231 error = EINVAL;
3232 goto bad;
3233 }
3234 so->so_ts_clock = optval;
3235 break;
3236
3237 case SO_MAX_PACING_RATE:
3238 error = sooptcopyin(sopt, &val32, sizeof(val32),
3239 sizeof(val32));
3240 if (error)
3241 goto bad;
3242 so->so_max_pacing_rate = val32;
3243 break;
3244
3245 default:
3246 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
3247 error = hhook_run_socket(so, sopt,
3248 HHOOK_SOCKET_OPT);
3249 else
3250 error = ENOPROTOOPT;
3251 break;
3252 }
3253 if (error == 0 && so->so_proto->pr_ctloutput != NULL)
3254 (void)(*so->so_proto->pr_ctloutput)(so, sopt);
3255 }
3256bad:
3257 CURVNET_RESTORE();
3258 return (error);
3259}
3260
3261/*
3262 * Helper routine for getsockopt.
3263 */
3264int
3265sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
3266{
3267 int error;
3268 size_t valsize;
3269
3270 error = 0;
3271
3272 /*
3273 * Documented get behavior is that we always return a value, possibly
3274 * truncated to fit in the user's buffer. Traditional behavior is
3275 * that we always tell the user precisely how much we copied, rather
3276 * than something useful like the total amount we had available for
3277 * her. Note that this interface is not idempotent; the entire
3278 * answer must be generated ahead of time.
3279 */
3280 valsize = min(len, sopt->sopt_valsize);
3281 sopt->sopt_valsize = valsize;
3282 if (sopt->sopt_val != NULL) {
3283 if (sopt->sopt_td != NULL)
3284 error = copyout(buf, sopt->sopt_val, valsize);
3285 else
3286 bcopy(buf, sopt->sopt_val, valsize);
3287 }
3288 return (error);
3289}
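/*
 * Illustrative sketch, not part of uipc_socket.c: the SOPT_GET side of
 * a hypothetical protocol option, returned with sooptcopyout(), which
 * truncates to the caller's buffer size if necessary.
 */
static int
example_ctloutput_get(struct socket *so, struct sockopt *sopt)
{
	int optval;

	optval = 0;	/* hypothetical protocol-private value */
	return (sooptcopyout(sopt, &optval, sizeof(optval)));
}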
3290
3291int
3292sogetopt(struct socket *so, struct sockopt *sopt)
3293{
3294 int error, optval;
3295 struct linger l;
3296 struct timeval tv;
3297#ifdef MAC
3298 struct mac extmac;
3299#endif
3300
3301 CURVNET_SET(so->so_vnet);
3302 error = 0;
3303 if (sopt->sopt_level != SOL_SOCKET) {
3304 if (so->so_proto->pr_ctloutput != NULL)
3305 error = (*so->so_proto->pr_ctloutput)(so, sopt);
3306 else
3307 error = ENOPROTOOPT;
3308 CURVNET_RESTORE();
3309 return (error);
3310 } else {
3311 switch (sopt->sopt_name) {
3312 case SO_ACCEPTFILTER:
3313 error = accept_filt_getopt(so, sopt);
3314 break;
3315
3316 case SO_LINGER:
3317 SOCK_LOCK(so);
3318 l.l_onoff = so->so_options & SO_LINGER;
3319 l.l_linger = so->so_linger;
3320 SOCK_UNLOCK(so);
3321 error = sooptcopyout(sopt, &l, sizeof l);
3322 break;
3323
3324 case SO_USELOOPBACK:
3325 case SO_DONTROUTE:
3326 case SO_DEBUG:
3327 case SO_KEEPALIVE:
3328 case SO_REUSEADDR:
3329 case SO_REUSEPORT:
3330 case SO_REUSEPORT_LB:
3331 case SO_BROADCAST:
3332 case SO_OOBINLINE:
3333 case SO_ACCEPTCONN:
3334 case SO_TIMESTAMP:
3335 case SO_BINTIME:
3336 case SO_NOSIGPIPE:
3337 case SO_NO_DDP:
3338 case SO_NO_OFFLOAD:
3339 case SO_RERROR:
3340 optval = so->so_options & sopt->sopt_name;
3341integer:
3342 error = sooptcopyout(sopt, &optval, sizeof optval);
3343 break;
3344
3345 case SO_DOMAIN:
3346 optval = so->so_proto->pr_domain->dom_family;
3347 goto integer;
3348
3349 case SO_TYPE:
3350 optval = so->so_type;
3351 goto integer;
3352
3353 case SO_PROTOCOL:
3354 optval = so->so_proto->pr_protocol;
3355 goto integer;
3356
3357 case SO_ERROR:
3358 SOCK_LOCK(so);
3359 if (so->so_error) {
3360 optval = so->so_error;
3361 so->so_error = 0;
3362 } else {
3363 optval = so->so_rerror;
3364 so->so_rerror = 0;
3365 }
3366 SOCK_UNLOCK(so);
3367 goto integer;
3368
3369 case SO_SNDBUF:
3370 optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
3371 so->so_snd.sb_hiwat;
3372 goto integer;
3373
3374 case SO_RCVBUF:
3375 optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
3376 so->so_rcv.sb_hiwat;
3377 goto integer;
3378
3379 case SO_SNDLOWAT:
3380 optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
3381 so->so_snd.sb_lowat;
3382 goto integer;
3383
3384 case SO_RCVLOWAT:
3385 optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
3386 so->so_rcv.sb_lowat;
3387 goto integer;
3388
3389 case SO_SNDTIMEO:
3390 case SO_RCVTIMEO:
3391 tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
3392 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
3393#ifdef COMPAT_FREEBSD32
3394 if (SV_CURPROC_FLAG(SV_ILP32)) {
3395 struct timeval32 tv32;
3396
3397 CP(tv, tv32, tv_sec);
3398 CP(tv, tv32, tv_usec);
3399 error = sooptcopyout(sopt, &tv32, sizeof tv32);
3400 } else
3401#endif
3402 error = sooptcopyout(sopt, &tv, sizeof tv);
3403 break;
3404
3405 case SO_LABEL:
3406#ifdef MAC
3407 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
3408 sizeof(extmac));
3409 if (error)
3410 goto bad;
3411 error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
3412 so, &extmac);
3413 if (error)
3414 goto bad;
3415 error = sooptcopyout(sopt, &extmac, sizeof extmac);
3416#else
3417 error = EOPNOTSUPP;
3418#endif
3419 break;
3420
3421 case SO_PEERLABEL:
3422#ifdef MAC
3423 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
3424 sizeof(extmac));
3425 if (error)
3426 goto bad;
3427 error = mac_getsockopt_peerlabel(
3428 sopt->sopt_td->td_ucred, so, &extmac);
3429 if (error)
3430 goto bad;
3431 error = sooptcopyout(sopt, &extmac, sizeof extmac);
3432#else
3433 error = EOPNOTSUPP;
3434#endif
3435 break;
3436
3437 case SO_LISTENQLIMIT:
3438 optval = SOLISTENING(so) ? so->sol_qlimit : 0;
3439 goto integer;
3440
3441 case SO_LISTENQLEN:
3442 optval = SOLISTENING(so) ? so->sol_qlen : 0;
3443 goto integer;
3444
3445 case SO_LISTENINCQLEN:
3446 optval = SOLISTENING(so) ? so->sol_incqlen : 0;
3447 goto integer;
3448
3449 case SO_TS_CLOCK:
3450 optval = so->so_ts_clock;
3451 goto integer;
3452
3453 case SO_MAX_PACING_RATE:
3454 optval = so->so_max_pacing_rate;
3455 goto integer;
3456
3457 default:
3458 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
3459 error = hhook_run_socket(so, sopt,
3460 HHOOK_SOCKET_OPT);
3461 else
3462 error = ENOPROTOOPT;
3463 break;
3464 }
3465 }
3466#ifdef MAC
3467bad:
3468#endif
3469 CURVNET_RESTORE();
3470 return (error);
3471}
3472
3473int
3474soopt_getm(struct sockopt *sopt, struct mbuf **mp)
3475{
3476 struct mbuf *m, *m_prev;
3477 int sopt_size = sopt->sopt_valsize;
3478
3479 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
3480 if (m == NULL)
3481 return ENOBUFS;
3482 if (sopt_size > MLEN) {
3483 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
3484 if ((m->m_flags & M_EXT) == 0) {
3485 m_free(m);
3486 return ENOBUFS;
3487 }
3488 m->m_len = min(MCLBYTES, sopt_size);
3489 } else {
3490 m->m_len = min(MLEN, sopt_size);
3491 }
3492 sopt_size -= m->m_len;
3493 *mp = m;
3494 m_prev = m;
3495
3496 while (sopt_size) {
3497 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
3498 if (m == NULL) {
3499 m_freem(*mp);
3500 return ENOBUFS;
3501 }
3502 if (sopt_size > MLEN) {
3503 MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
3504 M_NOWAIT);
3505 if ((m->m_flags & M_EXT) == 0) {
3506 m_freem(m);
3507 m_freem(*mp);
3508 return ENOBUFS;
3509 }
3510 m->m_len = min(MCLBYTES, sopt_size);
3511 } else {
3512 m->m_len = min(MLEN, sopt_size);
3513 }
3514 sopt_size -= m->m_len;
3515 m_prev->m_next = m;
3516 m_prev = m;
3517 }
3518 return (0);
3519}
3520
3521int
3522soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
3523{
3524 struct mbuf *m0 = m;
3525
3526 if (sopt->sopt_val == NULL)
3527 return (0);
3528 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3529 if (sopt->sopt_td != NULL) {
3530 int error;
3531
3532 error = copyin(sopt->sopt_val, mtod(m, char *),
3533 m->m_len);
3534 if (error != 0) {
3535 m_freem(m0);
3536 return(error);
3537 }
3538 } else
3539 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
3540 sopt->sopt_valsize -= m->m_len;
3541 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
3542 m = m->m_next;
3543 }
 3544 if (m != NULL) /* enough space should have been allocated at ip6_sooptmcopyin() */
3545 panic("ip6_sooptmcopyin");
3546 return (0);
3547}
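/*
 * Illustrative sketch, not part of uipc_socket.c: pulling a set-option
 * value into an mbuf chain by pairing soopt_getm() with soopt_mcopyin(),
 * as variable-length option handlers do.  Names are hypothetical.
 */
static int
example_opt_to_mbuf(struct sockopt *sopt, struct mbuf **mp)
{
	int error;

	error = soopt_getm(sopt, mp);		/* allocate the chain */
	if (error != 0)
		return (error);
	error = soopt_mcopyin(sopt, *mp);	/* fill it from sopt_val */
	if (error != 0)
		*mp = NULL;			/* chain was freed on error */
	return (error);
}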
3548
3549int
3550soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
3551{
3552 struct mbuf *m0 = m;
3553 size_t valsize = 0;
3554
3555 if (sopt->sopt_val == NULL)
3556 return (0);
3557 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3558 if (sopt->sopt_td != NULL) {
3559 int error;
3560
3561 error = copyout(mtod(m, char *), sopt->sopt_val,
3562 m->m_len);
3563 if (error != 0) {
3564 m_freem(m0);
3565 return(error);
3566 }
3567 } else
3568 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
3569 sopt->sopt_valsize -= m->m_len;
3570 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
3571 valsize += m->m_len;
3572 m = m->m_next;
3573 }
3574 if (m != NULL) {
 3575 /* a large enough option buffer should have been supplied from user-land */
3576 m_freem(m0);
3577 return(EINVAL);
3578 }
3579 sopt->sopt_valsize = valsize;
3580 return (0);
3581}
3582
3583/*
3584 * sohasoutofband(): protocol notifies socket layer of the arrival of new
3585 * out-of-band data, which will then notify socket consumers.
3586 */
3587void
3588sohasoutofband(struct socket *so)
3589{
3590
3591 if (so->so_sigio != NULL)
3592 pgsigio(&so->so_sigio, SIGURG, 0);
3593 selwakeuppri(&so->so_rdsel, PSOCK);
3594}
3595
3596int
3597sopoll(struct socket *so, int events, struct ucred *active_cred,
3598 struct thread *td)
3599{
3600
3601 /*
3602 * We do not need to set or assert curvnet as long as everyone uses
3603 * sopoll_generic().
3604 */
3605 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
3606 td));
3607}
3608
3609int
3610sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
3611 struct thread *td)
3612{
3613 int revents;
3614
3615 SOCK_LOCK(so);
3616 if (SOLISTENING(so)) {
3617 if (!(events & (POLLIN | POLLRDNORM)))
3618 revents = 0;
3619 else if (!TAILQ_EMPTY(&so->sol_comp))
3620 revents = events & (POLLIN | POLLRDNORM);
3621 else if ((events & POLLINIGNEOF) == 0 && so->so_error)
3622 revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP;
3623 else {
3624 selrecord(td, &so->so_rdsel);
3625 revents = 0;
3626 }
3627 } else {
3628 revents = 0;
3629 SOCKBUF_LOCK(&so->so_snd);
3630 SOCKBUF_LOCK(&so->so_rcv);
3631 if (events & (POLLIN | POLLRDNORM))
3632 if (soreadabledata(so))
3633 revents |= events & (POLLIN | POLLRDNORM);
3634 if (events & (POLLOUT | POLLWRNORM))
3635 if (sowriteable(so))
3636 revents |= events & (POLLOUT | POLLWRNORM);
3637 if (events & (POLLPRI | POLLRDBAND))
3638 if (so->so_oobmark ||
3639 (so->so_rcv.sb_state & SBS_RCVATMARK))
3640 revents |= events & (POLLPRI | POLLRDBAND);
3641 if ((events & POLLINIGNEOF) == 0) {
3642 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3643 revents |= events & (POLLIN | POLLRDNORM);
3644 if (so->so_snd.sb_state & SBS_CANTSENDMORE)
3645 revents |= POLLHUP;
3646 }
3647 }
3648 if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
3649 revents |= events & POLLRDHUP;
3650 if (revents == 0) {
3651 if (events &
3652 (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) {
3653 selrecord(td, &so->so_rdsel);
3654 so->so_rcv.sb_flags |= SB_SEL;
3655 }
3656 if (events & (POLLOUT | POLLWRNORM)) {
3657 selrecord(td, &so->so_wrsel);
3658 so->so_snd.sb_flags |= SB_SEL;
3659 }
3660 }
3661 SOCKBUF_UNLOCK(&so->so_rcv);
3662 SOCKBUF_UNLOCK(&so->so_snd);
3663 }
3664 SOCK_UNLOCK(so);
3665 return (revents);
3666}
3667
3668int
3669soo_kqfilter(struct file *fp, struct knote *kn)
3670{
3671 struct socket *so = kn->kn_fp->f_data;
3672 struct sockbuf *sb;
3673 struct knlist *knl;
3674
3675 switch (kn->kn_filter) {
3676 case EVFILT_READ:
3677 kn->kn_fop = &soread_filtops;
3678 knl = &so->so_rdsel.si_note;
3679 sb = &so->so_rcv;
3680 break;
3681 case EVFILT_WRITE:
3682 kn->kn_fop = &sowrite_filtops;
3683 knl = &so->so_wrsel.si_note;
3684 sb = &so->so_snd;
3685 break;
3686 case EVFILT_EMPTY:
3687 kn->kn_fop = &soempty_filtops;
3688 knl = &so->so_wrsel.si_note;
3689 sb = &so->so_snd;
3690 break;
3691 default:
3692 return (EINVAL);
3693 }
3694
3695 SOCK_LOCK(so);
3696 if (SOLISTENING(so)) {
3697 knlist_add(knl, kn, 1);
3698 } else {
3699 SOCKBUF_LOCK(sb);
3700 knlist_add(knl, kn, 1);
3701 sb->sb_flags |= SB_KNOTE;
3702 SOCKBUF_UNLOCK(sb);
3703 }
3704 SOCK_UNLOCK(so);
3705 return (0);
3706}
3707
3708/*
3709 * Some routines that return EOPNOTSUPP for entry points that are not
3710 * supported by a protocol. Fill in as needed.
3711 */
3712int
3713pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
3714{
3715
3716 return EOPNOTSUPP;
3717}
3718
3719int
3720pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job)
3721{
3722
3723 return EOPNOTSUPP;
3724}
3725
3726int
3727pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
3728{
3729
3730 return EOPNOTSUPP;
3731}
3732
3733int
3734pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3735{
3736
3737 return EOPNOTSUPP;
3738}
3739
3740int
3741pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
3742 struct thread *td)
3743{
3744
3745 return EOPNOTSUPP;
3746}
3747
3748int
3749pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3750{
3751
3752 return EOPNOTSUPP;
3753}
3754
3755int
3756pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
3757 struct thread *td)
3758{
3759
3760 return EOPNOTSUPP;
3761}
3762
3763int
3764pru_connect2_notsupp(struct socket *so1, struct socket *so2)
3765{
3766
3767 return EOPNOTSUPP;
3768}
3769
3770int
3771pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
3772 struct ifnet *ifp, struct thread *td)
3773{
3774
3775 return EOPNOTSUPP;
3776}
3777
3778int
3779pru_disconnect_notsupp(struct socket *so)
3780{
3781
3782 return EOPNOTSUPP;
3783}
3784
3785int
3786pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
3787{
3788
3789 return EOPNOTSUPP;
3790}
3791
3792int
3793pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
3794{
3795
3796 return EOPNOTSUPP;
3797}
3798
3799int
3800pru_rcvd_notsupp(struct socket *so, int flags)
3801{
3802
3803 return EOPNOTSUPP;
3804}
3805
3806int
3807pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
3808{
3809
3810 return EOPNOTSUPP;
3811}
3812
3813int
3814pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
3815 struct sockaddr *addr, struct mbuf *control, struct thread *td)
3816{
3817
3818 if (control != NULL)
3819 m_freem(control);
3820 if ((flags & PRUS_NOTREADY) == 0)
3821 m_freem(m);
3822 return (EOPNOTSUPP);
3823}
3824
3825int
3826pru_ready_notsupp(struct socket *so, struct mbuf *m, int count)
3827{
3828
3829 return (EOPNOTSUPP);
3830}
3831
3832/*
3833 * This isn't really a ``null'' operation, but it's the default one and
3834 * doesn't do anything destructive.
3835 */
3836int
3837pru_sense_null(struct socket *so, struct stat *sb)
3838{
3839
3840 sb->st_blksize = so->so_snd.sb_hiwat;
3841 return 0;
3842}
3843
3844int
3845pru_shutdown_notsupp(struct socket *so)
3846{
3847
3848 return EOPNOTSUPP;
3849}
3850
3851int
3852pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
3853{
3854
3855 return EOPNOTSUPP;
3856}
3857
3858int
3859pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
3860 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
3861{
3862
3863 return EOPNOTSUPP;
3864}
3865
3866int
3867pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
3868 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3869{
3870
3871 return EOPNOTSUPP;
3872}
3873
3874int
3875pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
3876 struct thread *td)
3877{
3878
3879 return EOPNOTSUPP;
3880}
3881
 3882 static void
 3883 filt_sordetach(struct knote *kn)
 3884 {
3885 struct socket *so = kn->kn_fp->f_data;
3886
3887 so_rdknl_lock(so);
3888 knlist_remove(&so->so_rdsel.si_note, kn, 1);
3889 if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note))
3890 so->so_rcv.sb_flags &= ~SB_KNOTE;
3891 so_rdknl_unlock(so);
3892}
3893
3894/*ARGSUSED*/
3895static int
3896filt_soread(struct knote *kn, long hint)
3897{
3898 struct socket *so;
3899
3900 so = kn->kn_fp->f_data;
3901
3902 if (SOLISTENING(so)) {
3903 SOCK_LOCK_ASSERT(so);
3904 kn->kn_data = so->sol_qlen;
3905 if (so->so_error) {
3906 kn->kn_flags |= EV_EOF;
3907 kn->kn_fflags = so->so_error;
3908 return (1);
3909 }
3910 return (!TAILQ_EMPTY(&so->sol_comp));
3911 }
3912
3913 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3914
3915 kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
3916 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3917 kn->kn_flags |= EV_EOF;
3918 kn->kn_fflags = so->so_error;
3919 return (1);
3920 } else if (so->so_error || so->so_rerror)
3921 return (1);
3922
3923 if (kn->kn_sfflags & NOTE_LOWAT) {
3924 if (kn->kn_data >= kn->kn_sdata)
3925 return (1);
3926 } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat)
3927 return (1);
3928
 3929 /* This hook returning non-zero indicates an event, not an error */
3930 return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD));
3931}
3932
 3933 static void
 3934 filt_sowdetach(struct knote *kn)
 3935 {
3936 struct socket *so = kn->kn_fp->f_data;
3937
3938 so_wrknl_lock(so);
3939 knlist_remove(&so->so_wrsel.si_note, kn, 1);
3940 if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note))
3941 so->so_snd.sb_flags &= ~SB_KNOTE;
3942 so_wrknl_unlock(so);
3943}
3944
3945/*ARGSUSED*/
3946static int
3947filt_sowrite(struct knote *kn, long hint)
3948{
3949 struct socket *so;
3950
3951 so = kn->kn_fp->f_data;
3952
3953 if (SOLISTENING(so))
3954 return (0);
3955
3956 SOCKBUF_LOCK_ASSERT(&so->so_snd);
3957 kn->kn_data = sbspace(&so->so_snd);
3958
3959 hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE);
3960
3961 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3962 kn->kn_flags |= EV_EOF;
3963 kn->kn_fflags = so->so_error;
3964 return (1);
3965 } else if (so->so_error) /* temporary udp error */
3966 return (1);
3967 else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3968 (so->so_proto->pr_flags & PR_CONNREQUIRED))
3969 return (0);
3970 else if (kn->kn_sfflags & NOTE_LOWAT)
3971 return (kn->kn_data >= kn->kn_sdata);
3972 else
3973 return (kn->kn_data >= so->so_snd.sb_lowat);
3974}
3975
3976static int
3977filt_soempty(struct knote *kn, long hint)
3978{
3979 struct socket *so;
3980
3981 so = kn->kn_fp->f_data;
3982
3983 if (SOLISTENING(so))
3984 return (1);
3985
3986 SOCKBUF_LOCK_ASSERT(&so->so_snd);
3987 kn->kn_data = sbused(&so->so_snd);
3988
3989 if (kn->kn_data == 0)
3990 return (1);
3991 else
3992 return (0);
3993}
3994
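The three filters above back the EVFILT_READ, EVFILT_WRITE and EVFILT_EMPTY kevent(2) filters for sockets; note that on a listening socket filt_soread() reports completed connections rather than buffered data. A small userland sketch (illustrative only; the kqueue descriptor kq and socket s are assumed to exist) of how the NOTE_LOWAT comparison in filt_soread() looks from the consumer side:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

/* Block until at least 512 bytes are readable on socket s. */
static int
wait_for_data(int kq, int s)
{
	struct kevent kev;

	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 512, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return (-1);
	/* Returns once filt_soread() reports the knote as active. */
	return (kevent(kq, NULL, 0, &kev, 1, NULL));
}
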
3995int
3996socheckuid(struct socket *so, uid_t uid)
3997{
3998
3999 if (so == NULL)
4000 return (EPERM);
4001 if (so->so_cred->cr_uid != uid)
4002 return (EPERM);
4003 return (0);
4004}
4005
4006/*
4007 * These functions are used by protocols to notify the socket layer (and its
4008 * consumers) of state changes in the sockets driven by protocol-side events.
4009 */
4010
4011/*
4012 * Procedures to manipulate state flags of socket and do appropriate wakeups.
4013 *
4014 * Normal sequence from the active (originating) side is that
4015 * soisconnecting() is called during processing of connect() call, resulting
4016 * in an eventual call to soisconnected() if/when the connection is
4017 * established. When the connection is torn down soisdisconnecting() is
4018 * called during processing of disconnect() call, and soisdisconnected() is
4019 * called when the connection to the peer is totally severed. The semantics
4020 * of these routines are such that connectionless protocols can call
4021 * soisconnected() and soisdisconnected() only, bypassing the in-progress
4022 * calls when setting up a ``connection'' takes no time.
4023 *
4024 * From the passive side, a socket is created with two queues of sockets:
4025 * so_incomp for connections in progress and so_comp for connections already
4026 * made and awaiting user acceptance. As a protocol is preparing incoming
4027 * connections, it creates a socket structure queued on so_incomp by calling
4028 * sonewconn(). When the connection is established, soisconnected() is
4029 * called, and transfers the socket structure to so_comp, making it available
4030 * to accept().
4031 *
4032 * If a socket is closed with sockets on either so_incomp or so_comp, these
4033 * sockets are dropped.
4034 *
4035 * If higher-level protocols are implemented in the kernel, the wakeups done
4036 * here will sometimes cause software-interrupt process scheduling.
4037 */
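As a concrete illustration of the sequence described above: a connection-oriented protocol would typically call soisconnecting() from its connect request and soisconnected() from its input path once the handshake completes, while a connectionless protocol may call soisconnected() directly. A sketch with a hypothetical protocol (the foo_* names are assumptions):

static int
foo_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	/* Queue the handshake to the peer, then mark it in progress. */
	soisconnecting(so);
	return (0);
}

static void
foo_handshake_done(struct socket *so)
{
	/* Called from the protocol input path once the peer answers. */
	soisconnected(so);
}
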
4038void
4039soisconnecting(struct socket *so)
4040{
4041
4042 SOCK_LOCK(so);
4043 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
4044 so->so_state |= SS_ISCONNECTING;
4045 SOCK_UNLOCK(so);
4046}
4047
4048void
4049soisconnected(struct socket *so)
4050{
4051 bool last __diagused;
4052
4053 SOCK_LOCK(so);
4054 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
4055 so->so_state |= SS_ISCONNECTED;
4056
4057 if (so->so_qstate == SQ_INCOMP) {
4058 struct socket *head = so->so_listen;
4059 int ret;
4060
4061 KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so));
4062 /*
4063   * To promote a socket from the incomplete queue to the complete
4064   * queue, we have to take the locks in the reverse of the usual
4065   * order.  First try a trylock; if that fails, take the hard way:
4066   * leave a reference on the listening socket, acquire the locks in
4067   * the proper order, and recheck consistency afterwards.
4068 */
4069 if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
4070 soref(head);
4071 SOCK_UNLOCK(so);
4072 SOLISTEN_LOCK(head);
4073 SOCK_LOCK(so);
4074 if (__predict_false(head != so->so_listen)) {
4075 /*
4076     * The socket went off the listen queue,
4077     * most likely because we lost a race with close(2) on sol.
4078     * The socket is about to be torn down by soabort().
4079 */
4080 SOCK_UNLOCK(so);
4081 sorele_locked(head);
4082 return;
4083 }
4084 last = refcount_release(&head->so_count);
4085 KASSERT(!last, ("%s: released last reference for %p",
4086 __func__, head));
4087 }
4088again:
4089 if ((so->so_options & SO_ACCEPTFILTER) == 0) {
4090 TAILQ_REMOVE(&head->sol_incomp, so, so_list);
4091 head->sol_incqlen--;
4092 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
4093 head->sol_qlen++;
4094 so->so_qstate = SQ_COMP;
4095 SOCK_UNLOCK(so);
4096 solisten_wakeup(head); /* unlocks */
4097 } else {
4098 SOCKBUF_LOCK(&so->so_rcv);
4099 soupcall_set(so, SO_RCV,
4100 head->sol_accept_filter->accf_callback,
4101 head->sol_accept_filter_arg);
4102 so->so_options &= ~SO_ACCEPTFILTER;
4103 ret = head->sol_accept_filter->accf_callback(so,
4104 head->sol_accept_filter_arg, M_NOWAIT);
4105 if (ret == SU_ISCONNECTED) {
4106 soupcall_clear(so, SO_RCV);
4107 SOCKBUF_UNLOCK(&so->so_rcv);
4108 goto again;
4109 }
4110 SOCKBUF_UNLOCK(&so->so_rcv);
4111 SOCK_UNLOCK(so);
4112 SOLISTEN_UNLOCK(head);
4113 }
4114 return;
4115 }
4116 SOCK_UNLOCK(so);
4117 wakeup(&so->so_timeo);
4118 sorwakeup(so);
4119 sowwakeup(so);
4120}
4121
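The SO_ACCEPTFILTER branch in soisconnected() above is what holds a connection on the incomplete queue until an accept filter declares it ready. A userland sketch of arming the accf_data(9) filter on a listening socket (the accf_data module must be loaded; illustrative only):

#include <sys/types.h>
#include <sys/socket.h>
#include <string.h>

static int
set_dataready_filter(int lsock)
{
	struct accept_filter_arg afa;

	memset(&afa, 0, sizeof(afa));
	strcpy(afa.af_name, "dataready");	/* provided by accf_data(9) */
	return (setsockopt(lsock, SOL_SOCKET, SO_ACCEPTFILTER,
	    &afa, sizeof(afa)));
}
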
4122void
4123soisdisconnecting(struct socket *so)
4124{
4125
4126 SOCK_LOCK(so);
4127 so->so_state &= ~SS_ISCONNECTING;
4128 so->so_state |= SS_ISDISCONNECTING;
4129
4130 if (!SOLISTENING(so)) {
4131  SOCKBUF_LOCK(&so->so_rcv);
4132  socantrcvmore_locked(so);
4133  SOCKBUF_LOCK(&so->so_snd);
4134  socantsendmore_locked(so);
4135 }
4136 SOCK_UNLOCK(so);
4137 wakeup(&so->so_timeo);
4138}
4139
4140void
4141soisdisconnected(struct socket *so)
4142{
4143
4144 SOCK_LOCK(so);
4145
4146 /*
4147 * There is at least one reader of so_state that does not
4148 * acquire socket lock, namely soreceive_generic(). Ensure
4149 * that it never sees all flags that track connection status
4150 * cleared, by ordering the update with a barrier semantic of
4151 * our release thread fence.
4152 */
4153 so->so_state |= SS_ISDISCONNECTED;
4154 atomic_thread_fence_rel();
4155 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
4156
4157 if (!SOLISTENING(so)) {
4158 SOCK_UNLOCK(so);
4159  SOCKBUF_LOCK(&so->so_rcv);
4160  socantrcvmore_locked(so);
4161  SOCKBUF_LOCK(&so->so_snd);
4162  sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
4163  socantsendmore_locked(so);
4164 } else
4165 SOCK_UNLOCK(so);
4166 wakeup(&so->so_timeo);
4167}
4168
4169int
4170soiolock(struct socket *so, struct sx *sx, int flags)
4171{
4172 int error;
4173
4174 KASSERT((flags & SBL_VALID) == flags,
4175 ("soiolock: invalid flags %#x", flags));
4176
4177 if ((flags & SBL_WAIT) != 0) {
4178 if ((flags & SBL_NOINTR) != 0) {
4179 sx_xlock(sx);
4180 } else {
4181 error = sx_xlock_sig(sx);
4182 if (error != 0)
4183 return (error);
4184 }
4185 } else if (!sx_try_xlock(sx)) {
4186 return (EWOULDBLOCK);
4187 }
4188
4189 if (__predict_false(SOLISTENING(so))) {
4190 sx_xunlock(sx);
4191 return (ENOTCONN);
4192 }
4193 return (0);
4194}
4195
4196void
4197soiounlock(struct sx *sx)
4198{
4199 sx_xunlock(sx);
4200}
4201
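soiolock() and soiounlock() serialize an entire send or receive operation on a socket; callers normally reach them through the SOCK_IO_SEND_LOCK()/SOCK_IO_RECV_LOCK() wrapper macros. A sketch of the usual caller pattern, assuming those wrappers (foo_receive is hypothetical):

static int
foo_receive(struct socket *so, struct uio *uio, int flags)
{
	int error;

	/* Non-blocking callers get EWOULDBLOCK instead of sleeping. */
	error = SOCK_IO_RECV_LOCK(so,
	    (flags & MSG_DONTWAIT) ? 0 : SBL_WAIT);
	if (error != 0)
		return (error);		/* EWOULDBLOCK, EINTR or ENOTCONN */
	/* ... dequeue data from so->so_rcv ... */
	SOCK_IO_RECV_UNLOCK(so);
	return (0);
}
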
4202/*
4203 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
4204 */
4205struct sockaddr *
4206sodupsockaddr(const struct sockaddr *sa, int mflags)
4207{
4208 struct sockaddr *sa2;
4209
4210 sa2 = malloc(sa->sa_len, M_SONAME, mflags);
4211 if (sa2)
4212 bcopy(sa, sa2, sa->sa_len);
4213 return sa2;
4214}
4215
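The copy returned by sodupsockaddr() belongs to the caller and must be released with the same malloc type; a sketch of the pattern:

	struct sockaddr *sa2;

	sa2 = sodupsockaddr(sa, M_NOWAIT);
	if (sa2 == NULL)
		return (ENOBUFS);
	/* ... hand sa2 to the consumer ... */
	free(sa2, M_SONAME);
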
4216/*
4217 * Register per-socket destructor.
4218 */
4219void
4220sodtor_set(struct socket *so, so_dtor_t *func)
4221{
4222
4223 SOCK_LOCK_ASSERT(so);
4224 so->so_dtor = func;
4225}
4226
4227/*
4228 * Register per-socket buffer upcalls.
4229 */
4230void
4231soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg)
4232{
4233 struct sockbuf *sb;
4234
4235 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
4236
4237 switch (which) {
4238 case SO_RCV:
4239 sb = &so->so_rcv;
4240 break;
4241 case SO_SND:
4242 sb = &so->so_snd;
4243 break;
4244 default:
4245 panic("soupcall_set: bad which");
4246 }
4247 SOCKBUF_LOCK_ASSERT(sb);
4248 sb->sb_upcall = func;
4249 sb->sb_upcallarg = arg;
4250 sb->sb_flags |= SB_UPCALL;
4251}
4252
4253void
4254soupcall_clear(struct socket *so, int which)
4255{
4256 struct sockbuf *sb;
4257
4258 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
4259
4260 switch (which) {
4261 case SO_RCV:
4262 sb = &so->so_rcv;
4263 break;
4264 case SO_SND:
4265 sb = &so->so_snd;
4266 break;
4267 default:
4268 panic("soupcall_clear: bad which");
4269 }
4270 SOCKBUF_LOCK_ASSERT(sb);
4271 KASSERT(sb->sb_upcall != NULL,
4272 ("%s: so %p no upcall to clear", __func__, so));
4273 sb->sb_upcall = NULL;
4274 sb->sb_upcallarg = NULL;
4275 sb->sb_flags &= ~SB_UPCALL;
4276}
4277
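Socket buffer upcalls let in-kernel consumers (accept filters and similar code) be notified of buffer activity; registration must happen under the corresponding sockbuf lock, and the upcall is invoked from the wakeup path with that lock held. A sketch with a hypothetical consumer (the foo_* names are assumptions):

static int
foo_rcv_upcall(struct socket *so, void *arg, int waitflag)
{
	/* E.g. schedule a task that will drain so->so_rcv later. */
	return (SU_OK);
}

static void
foo_watch(struct socket *so, void *arg)
{
	SOCKBUF_LOCK(&so->so_rcv);
	soupcall_set(so, SO_RCV, foo_rcv_upcall, arg);
	SOCKBUF_UNLOCK(&so->so_rcv);
}
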
4278void
4279solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
4280{
4281
4282 SOLISTEN_LOCK_ASSERT(so);
4283 so->sol_upcall = func;
4284 so->sol_upcallarg = arg;
4285}
4286
4287static void
4288so_rdknl_lock(void *arg)
4289{
4290 struct socket *so = arg;
4291
4292 if (SOLISTENING(so))
4293 SOCK_LOCK(so);
4294 else
4295 SOCKBUF_LOCK(&so->so_rcv);
4296}
4297
4298static void
4299so_rdknl_unlock(void *arg)
4300{
4301 struct socket *so = arg;
4302
4303 if (SOLISTENING(so))
4304 SOCK_UNLOCK(so);
4305 else
4306 SOCKBUF_UNLOCK(&so->so_rcv);
4307}
4308
4309static void
4310so_rdknl_assert_lock(void *arg, int what)
4311{
4312 struct socket *so = arg;
4313
4314 if (what == LA_LOCKED) {
4315 if (SOLISTENING(so))
4316 SOCK_LOCK_ASSERT(so);
4317 else
4318 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
4319 } else {
4320 if (SOLISTENING(so))
4321 SOCK_UNLOCK_ASSERT(so);
4322 else
4323 SOCKBUF_UNLOCK_ASSERT(&so->so_rcv);
4324 }
4325}
4326
4327static void
4328so_wrknl_lock(void *arg)
4329{
4330 struct socket *so = arg;
4331
4332 if (SOLISTENING(so))
4333 SOCK_LOCK(so);
4334 else
4335 SOCKBUF_LOCK(&so->so_snd);
4336}
4337
4338static void
4339so_wrknl_unlock(void *arg)
4340{
4341 struct socket *so = arg;
4342
4343 if (SOLISTENING(so))
4344 SOCK_UNLOCK(so);
4345 else
4346 SOCKBUF_UNLOCK(&so->so_snd);
4347}
4348
4349static void
4350so_wrknl_assert_lock(void *arg, int what)
4351{
4352 struct socket *so = arg;
4353
4354 if (what == LA_LOCKED) {
4355 if (SOLISTENING(so))
4356 SOCK_LOCK_ASSERT(so);
4357 else
4358 SOCKBUF_LOCK_ASSERT(&so->so_snd);
4359 } else {
4360 if (SOLISTENING(so))
4361 SOCK_UNLOCK_ASSERT(so);
4362 else
4363 SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
4364 }
4365}
4366
4367/*
4368 * Create an external-format (``xsocket'') structure using the information in
4369 * the kernel-format socket structure pointed to by so. This is done to
4370 * reduce the spew of irrelevant information over this interface, to isolate
4371 * user code from changes in the kernel structure, and potentially to provide
4372 * information-hiding if we decide that some of this information should be
4373 * hidden from users.
4374 */
4375void
4376sotoxsocket(struct socket *so, struct xsocket *xso)
4377{
4378
4379 bzero(xso, sizeof(*xso));
4380 xso->xso_len = sizeof *xso;
4381 xso->xso_so = (uintptr_t)so;
4382 xso->so_type = so->so_type;
4383 xso->so_options = so->so_options;
4384 xso->so_linger = so->so_linger;
4385 xso->so_state = so->so_state;
4386 xso->so_pcb = (uintptr_t)so->so_pcb;
4387 xso->xso_protocol = so->so_proto->pr_protocol;
4388 xso->xso_family = so->so_proto->pr_domain->dom_family;
4389 xso->so_timeo = so->so_timeo;
4390 xso->so_error = so->so_error;
4391 xso->so_uid = so->so_cred->cr_uid;
4392 xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
4393 if (SOLISTENING(so)) {
4394 xso->so_qlen = so->sol_qlen;
4395 xso->so_incqlen = so->sol_incqlen;
4396 xso->so_qlimit = so->sol_qlimit;
4397 xso->so_oobmark = 0;
4398 } else {
4399 xso->so_state |= so->so_qstate;
4400 xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
4401 xso->so_oobmark = so->so_oobmark;
4402 sbtoxsockbuf(&so->so_snd, &xso->so_snd);
4403 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
4404 }
4405}
4406
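The usual consumer of sotoxsocket() is a sysctl handler that walks protocol control blocks and copies one struct xsocket per socket out to userland, roughly as in this sketch (the handler name is hypothetical):

static int
foo_export_socket(struct sysctl_req *req, struct socket *so)
{
	struct xsocket xso;

	sotoxsocket(so, &xso);
	return (SYSCTL_OUT(req, &xso, sizeof(xso)));
}
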
4407struct sockbuf *
4408so_sockbuf_rcv(struct socket *so)
4409{
4410
4411 return (&so->so_rcv);
4412}
4413
4414struct sockbuf *
4415so_sockbuf_snd(struct socket *so)
4416{
4417
4418 return (&so->so_snd);
4419}
4420
4421int
4422so_state_get(const struct socket *so)
4423{
4424
4425 return (so->so_state);
4426}
4427
4428void
4429so_state_set(struct socket *so, int val)
4430{
4431
4432 so->so_state = val;
4433}
4434
4435int
4436so_options_get(const struct socket *so)
4437{
4438
4439 return (so->so_options);
4440}
4441
4442void
4443so_options_set(struct socket *so, int val)
4444{
4445
4446 so->so_options = val;
4447}
4448
4449int
4450so_error_get(const struct socket *so)
4451{
4452
4453 return (so->so_error);
4454}
4455
4456void
4457so_error_set(struct socket *so, int val)
4458{
4459
4460 so->so_error = val;
4461}
4462
4463int
4464so_linger_get(const struct socket *so)
4465{
4466
4467 return (so->so_linger);
4468}
4469
4470void
4471so_linger_set(struct socket *so, int val)
4472{
4473
4474 KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz),
4475 ("%s: val %d out of range", __func__, val));
4476
4477 so->so_linger = val;
4478}
4479
4480struct protosw *
4481so_protosw_get(const struct socket *so)
4482{
4483
4484 return (so->so_proto);
4485}
4486
4487void
4488so_protosw_set(struct socket *so, struct protosw *val)
4489{
4490
4491 so->so_proto = val;
4492}
4493
4494void
4495so_sorwakeup(struct socket *so)
4496{
4497
4498 sorwakeup(so);
4499}
4500
4501void
4502so_sowwakeup(struct socket *so)
4503{
4504
4505 sowwakeup(so);
4506}
4507
4508void
4509so_sorwakeup_locked(struct socket *so)
4510{
4511
4512 sorwakeup_locked(so);
4513}
4514
4515void
4516so_sowwakeup_locked(struct socket *so)
4517{
4518
4519 sowwakeup_locked(so);
4520}
4521
4522void
4523so_lock(struct socket *so)
4524{
4525
4526 SOCK_LOCK(so);
4527}
4528
4529void
4530so_unlock(struct socket *so)
4531{
4532
4533 SOCK_UNLOCK(so);
4534}