32#include "opt_tcpdebug.h"
105#include <sys/param.h>
107#include <sys/interrupt.h>
108#include <sys/module.h>
109#include <sys/kernel.h>
110#include <sys/hhook.h>
111#include <sys/malloc.h>
114#include <sys/socket.h>
115#include <sys/socketvar.h>
116#include <sys/sysctl.h>
117#include <sys/systm.h>
118#include <sys/refcount.h>
119#include <sys/sched.h>
120#include <sys/queue.h>
122#include <sys/counter.h>
124#include <sys/kthread.h>
125#include <sys/kern_prefetch.h>
130#include <net/route.h>
134#include <net/netisr.h>
135#include <net/rss_config.h>
148#include <netinet6/in6_pcb.h>
149#include <netinet6/ip6_var.h>
/*
 * Number of slots in the hpts timing wheel.  Each slot maps to a
 * short time quantum; the wheel wraps at this count (see the
 * "Invalid tick ... not on wheel" assertion below).
 */
#define NUM_OF_HPTSI_SLOTS 102400

/* Convenience wrappers for the per-hpts-entry mutex (p_mtx). */
#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
#define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx)
#define HPTS_UNLOCK(hpts) mtx_unlock(&(hpts)->p_mtx)
222 struct intr_event *
ie;
260SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
261 "TCP Hpts controls");
262SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
263 "TCP Hpts statistics");
/*
 * Subtract timeval *uvp from *tvp, storing the result in *vvp.
 * When the microsecond difference goes negative, borrow one second
 * from tv_sec so tv_usec stays in [0, 1000000).  Wrapped in
 * do { } while (0) so the macro behaves as a single statement.
 */
#define timersub(tvp, uvp, vvp)						\
	do {								\
		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
		if ((vvp)->tv_usec < 0) {				\
			(vvp)->tv_sec--;				\
			(vvp)->tv_usec += 1000000;			\
		}							\
	} while (0)
292 "Number of times hpts could not catch up and was behind hopelessly");
297 &
hpts_loops,
"Number of times hpts had to loop to catch up");
307 &
combined_wheel_wrap,
"Number of times the wheel lagged enough to have an insert see wrap");
312 &
wheel_wrap,
"Number of times the wheel lagged enough to have an insert see wrap");
316 &
hpts_direct_call,
"Number of times hpts was called by syscall/trap or other entry");
321 &
hpts_wake_timeout,
"Number of times hpts threads woke up via the callout expiring");
331 &
hpts_back_tosleep,
"Number of times hpts threads woke up via the callout expiring and went back to sleep no work");
337 &
cpu_uses_flowid,
"Number of times when setting cpuid we used the flowid field");
339 &
cpu_uses_random,
"Number of times when setting cpuid we used the a random value");
343SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, bind_hptss, CTLFLAG_RD,
345 "Thread Binding tunable");
348 "Use of irq CPU tunable");
349SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
351 "Value for PRE() precision of callout");
352SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, cnt_thresh, CTLFLAG_RW,
354 "How many connections (below) make us use the callout based mechanism");
357 "Do we add to any tp that has logging on pacer logs");
358SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_assigned_cpu, CTLFLAG_RW,
360 "Do we start any hpts timer on the assigned cpu?");
361SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_oldest, CTLFLAG_RW,
363 "Do syscalls look for the hpts that has been the longest since running (or just use cpu no if 0)?");
364SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_minsleep, CTLFLAG_RW,
366 "What is the dynamic minsleep value?");
367SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_maxsleep, CTLFLAG_RW,
369 "What is the dynamic maxsleep value?");
378 "What is the maximum number of times the pacer will loop trying to catch up");
/* Cap dynamic sleep at half a wheel revolution so the pacer can never
 * sleep long enough for the wheel to lap an insertion. */
#define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2)
391 error = sysctl_handle_int(oidp, &
new, 0, req);
392 if (error == 0 && req->newptr) {
409 error = sysctl_handle_int(oidp, &
new, 0, req);
410 if (error == 0 && req->newptr) {
420 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
423 "Maximum time hpts will sleep");
426 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
429 "The minimum time the hpts must sleep before processing more slots");
435SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, more_sleep, CTLFLAG_RW,
437 "If we only process this many or less on a timeout, we need longer sleep on the next callout");
438SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, less_sleep, CTLFLAG_RW,
440 "If we process this many or more on a timeout, we need less sleep on the next callout");
441SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
443 "When we are over the threshold on the pacer do we prohibit wakeups?");
447 int slots_to_run,
int idx,
int from_callout)
528 TAILQ_INSERT_TAIL(&
hptsh->head, inp, inp_hpts);
549 bool released __diagused;
553 MPASS(released ==
false);
575 TAILQ_REMOVE(&
hptsh->head, inp, inp_hpts);
576 MPASS(
hptsh->count > 0);
589 TAILQ_FOREACH(tmp, &
hptsh->head, inp_hpts)
623 KASSERT(wheel_slot <
NUM_OF_HPTSI_SLOTS, (
"Invalid tick %u not on wheel", wheel_slot));
647 if (slot_now > prev_slot)
648 return (slot_now - prev_slot);
649 else if (slot_now == prev_slot)
675 uint32_t dis_to_travel, end_slot, pacer_to_now, avail_on_wheel;
686 *target_slot = end_slot;
702 *target_slot = end_slot;
754 if (avail_on_wheel <= pacer_to_now) {
774 return (avail_on_wheel - pacer_to_now);
788 (
"hpts:%p inp:%p slot:%d > max",
798 int distance, yet_to_run;
805 KASSERT(yet_to_run <= distance,
806 (
"hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
808 distance, yet_to_run,
819 uint32_t slot_on, wheel_cts, last_slot, need_new_to = 0;
820 int32_t wheel_slot, maxslots;
822 bool need_wakeup =
false;
836 memset(diag, 0,
sizeof(
struct hpts_diag));
902 }
else if (maxslots >= slot) {
916 check_if_slot_would_be_wrong(hpts, inp, inp->
inp_hptsslot, line);
945 (yet_to_sleep > slot)) {
963 diag->
co_ret = 0xffff0000;
965 }
else if (need_new_to) {
976 tv.tv_usec = need_new_to;
979 co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
1039 if (cpuid == NETISR_CPUID_NONE)
1098 if (TAILQ_EMPTY(&hpts->
p_hptss[t].head) == 0) {
1104 KASSERT(fnd != 0, (
"Hpts:%p cnt:%d but none found", hpts, hpts->
p_on_queue_cnt));
1118 uint64_t total_slots_processed = 0;
1119 int32_t slots_to_run, i, error;
1120 int32_t paced_cnt = 0;
1121 int32_t loop_cnt = 0;
1122 int32_t did_prefetch = 0;
1123 int32_t prefetch_ninp = 0;
1124 int32_t prefetch_tp = 0;
1125 int32_t wrap_loop_cnt = 0;
1126 int32_t slot_pos_of_endpoint = 0;
1127 int32_t orig_exit_slot;
1128 int8_t completed_measure = 0, seen_endpoint = 0;
1227 for (i = 0; i < slots_to_run; i++) {
1228 struct inpcb *inp, *ninp;
1242 TAILQ_SWAP(&head, &
hptsh->head,
inpcb, inp_hpts);
1249 TAILQ_FOREACH_SAFE(inp, &head, inp_hpts, ninp) {
1254 kern_prefetch(ninp, &prefetch_ninp);
1259 if (seen_endpoint == 0) {
1261 orig_exit_slot = slot_pos_of_endpoint =
1263 }
else if (completed_measure == 0) {
1265 orig_exit_slot = runningslot;
1267 total_slots_processed++;
1294 (
"Hpts:%p inp:%p slot mis-aligned %u vs %u",
1304 uint32_t maxslots, last_slot, remaining_slots;
1306 remaining_slots = slots_to_run - (i + 1);
1359 CURVNET_SET(inp->inp_vnet);
1362 tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout);
1366 kern_prefetch(tp->
t_fb_ptr, &did_prefetch);
1377 error = tcp_output(tp);
1413 kern_prefetch(ninp->
inp_ppcb, &prefetch_tp);
1420 if (seen_endpoint) {
1428 completed_measure = 1;
1466 if (seen_endpoint == 0) {
1470 if ((wrap_loop_cnt < 2) &&
1493 (wrap_loop_cnt >= 2) || (from_callout == 0)),
1494 (
"H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
1497 || (wrap_loop_cnt >= 2) || (from_callout == 0)),
1498 (
"H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
1530 mtx_unlock(&hpts->
p_mtx);
1542 if (mtx_trylock(&hpts->
p_mtx) == 0) {
1566 if (tv.tv_usec < hpts->
p_mysleep.tv_usec) {
1584 callout_reset_sbt_on(&hpts->co, sb, 0,
1598 mtx_unlock(&hpts->
p_mtx);
1605 uint32_t cts, time_since_ran, calc;
1629 if (calc > time_since_ran) {
1631 time_since_ran = calc;
1634 if (oldest_idx >= 0)
1645 struct epoch_tracker et;
1647 NET_EPOCH_ENTER(et);
1658 struct epoch_tracker et;
1664 mtx_lock(&hpts->
p_mtx);
1667 callout_stop(&hpts->co);
1672 if (callout_pending(&hpts->co) ||
1673 !callout_active(&hpts->co)) {
1674 mtx_unlock(&hpts->
p_mtx);
1678 callout_deactivate(&hpts->co);
1680 NET_EPOCH_ENTER(et);
1739 if (tv.tv_usec < hpts->
p_mysleep.tv_usec) {
1793 callout_reset_sbt_on(&hpts->co, sb, 0,
1797 mtx_unlock(&hpts->
p_mtx);
1805 int32_t i, j, error, bound = 0, created = 0;
1813 uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
1814 int count, domain, cpu;
1837 M_TCPHPTS, M_WAITOK | M_ZERO);
1839 M_TCPHPTS, M_WAITOK);
1846 mtx_init(&hpts->
p_mtx,
"tcp_hpts_lck",
1847 "hpts", MTX_DEF | MTX_DUPOK);
1849 TAILQ_INIT(&hpts->
p_hptss[j].head);
1854 sprintf(unit,
"%d", i);
1856 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
1859 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1863 OID_AUTO,
"out_qcnt", CTLFLAG_RD,
1865 "Count TCB's awaiting output processing");
1868 OID_AUTO,
"active", CTLFLAG_RD,
1870 "Is the hpts active");
1873 OID_AUTO,
"curslot", CTLFLAG_RD,
1875 "What the current running pacers goal");
1878 OID_AUTO,
"runtick", CTLFLAG_RD,
1880 "What the running pacers current slot is");
1883 OID_AUTO,
"curtick", CTLFLAG_RD,
1885 "What the running pacers last tick mapped to the wheel was");
1888 OID_AUTO,
"lastran", CTLFLAG_RD,
1890 "The last usec tick that this hpts ran");
1893 OID_AUTO,
"cur_min_sleep", CTLFLAG_RD,
1895 "What the running pacers is using for p_mysleep.tv_usec");
1898 OID_AUTO,
"now_sleeping", CTLFLAG_RD,
1900 "What the running pacers is actually sleeping for");
1903 OID_AUTO,
"syscall_cnt", CTLFLAG_RD,
1905 "How many times we had syscalls on this hpts");
1912 hpts->
p_cpu = 0xffff;
1914 callout_init(&hpts->co, 1);
1927 error = swi_add(&hpts->
ie,
"hpts",
1929 SWI_NET, INTR_MPSAFE, &hpts->
ie_cookie);
1931 (
"Can't add hpts:%p i:%d err:%d",
1934 hpts->p_mysleep.tv_sec = 0;
1937 if (intr_event_bind(hpts->ie, i) == 0)
1941 domain = pc->pc_domain;
1942 CPU_COPY(&cpuset_domain[domain], &cs);
1943 if (intr_event_bind_ithread_cpuset(hpts->ie, &cs)
1953 hpts->sleeping = tv.tv_usec;
1956 callout_reset_sbt_on(&hpts->co, sb, 0,
1964 for (i = 0; i < vm_ndomains; i++) {
1970 printf(
"TCP Hpts created %d swi interrupt threads and bound %d to %s\n",
1974 printf(
"HPTS is in INVARIANT mode!!\n");
static TAILQ_HEAD(handler_chain, proto_handler)
void in_pcbref(struct inpcb *inp)
bool in_pcbrele_wlocked(struct inpcb *inp)
#define INP_LOCK_ASSERT(inp)
#define INP_WLOCK_ASSERT(inp)
#define INP_SUPPORTS_MBUFQ
struct socket * inp_socket
volatile uint16_t inp_hpts_cpu
uint32_t inp_hpts_request
volatile uint16_t inp_irq_cpu
int(* tfb_do_queued_segments)(struct socket *, struct tcpcb *, int)
struct tcp_hpts_entry::hptsh * p_hptss
struct callout co __aligned(CACHE_LINE_SIZE)
uint8_t p_hpts_wake_scheduled
struct sysctl_ctx_list hpts_ctx
uint32_t p_hpts_sleep_time
struct sysctl_oid * hpts_root
struct tcp_hpts_entry ** rp_ent
struct tcp_function_block * t_fb
static void __tcp_run_hpts(struct tcp_hpts_entry *hpts)
void tcp_hpts_remove(struct inpcb *inp)
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD, &hpts_hopelessly_behind, "Number of times hpts could not catch up and was behind hopelessly")
static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout)
counter_u64_t hpts_back_tosleep
static uint32_t hpts_sleep_max
#define HPTS_UNLOCK(hpts)
counter_u64_t cpu_uses_flowid
counter_u64_t hpts_direct_awakening
MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts")
counter_u64_t hpts_direct_call
int32_t tcp_min_hptsi_time
#define NUM_OF_HPTSI_SLOTS
TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads)
#define HPTS_MAX_SLEEP_ALLOWED
static void tcp_init_hptsi(void *st)
static void tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt)
static int32_t hpts_uses_oldest
static int32_t max_pacer_loops
MODULE_VERSION(tcphpts, 1)
static int tcp_hpts_no_wake_over_thresh
static struct tcp_hpts_entry * tcp_hpts_lock(struct inpcb *inp)
static uint32_t * cts_last_ran
counter_u64_t hpts_hopelessly_behind
static void tcp_wakehpts(struct tcp_hpts_entry *hpts)
uint32_t p_hpts_sleep_time
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, bind_hptss, CTLFLAG_RD, &tcp_bind_threads, 2, "Thread Binding tunable")
static void tcp_drop_in_pkts(struct tcpcb *tp)
static int ticks_indicate_more_sleep
void __tcp_set_hpts(struct inpcb *inp, int32_t line)
SYSINIT(tcphptsi, SI_SUB_SOFTINTR, SI_ORDER_ANY, tcp_init_hptsi, NULL)
static int hpts_use_assigned_cpu
uint16_t hpts_random_cpu(struct inpcb *inp)
SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLTYPE_UINT|CTLFLAG_RW|CTLFLAG_NEEDGIANT, &hpts_sleep_max, 0, &sysctl_net_inet_tcp_hpts_max_sleep, "IU", "Maximum time hpts will sleep")
static int tcp_use_irq_cpu
static void tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, int slots_to_run, int idx, int from_callout)
static struct tcp_hpts_entry * tcp_choose_hpts_to_run()
static int tick_to_wheel(uint32_t cts_in_wticks)
static struct hpts_domain_info hpts_domains[MAXMEMDOM]
static void inp_hpts_insert(struct inpcb *inp, struct tcp_hpts_entry *hpts)
static uint16_t hpts_cpuid(struct inpcb *inp, int *failed)
counter_u64_t back_tosleep
#define HPTS_MTX_ASSERT(hpts)
bool tcp_in_hpts(struct inpcb *inp)
counter_u64_t hpts_wake_timeout
counter_u64_t cpu_uses_random
uint32_t tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
counter_u64_t combined_wheel_wrap
static int32_t dynamic_max_sleep
static int32_t max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *target_slot)
static int hpts_does_tp_logging
static int sysctl_net_inet_tcp_hpts_min_sleep(SYSCTL_HANDLER_ARGS)
static int conn_cnt_thresh
static int hpts_slots_diff(int prev_slot, int slot_now)
static struct tcp_hptsi tcp_pace
static int tcp_bind_threads
static void hpts_timeout_swi(void *arg)
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW|CTLFLAG_MPSAFE, 0, "TCP Hpts controls")
static void tcp_hpts_thread(void *ctx)
static void inp_hpts_release(struct inpcb *inp)
static int sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
static int32_t tcp_hpts_precision
static int32_t dynamic_min_sleep
static int hpts_slot(uint32_t wheel_slot, uint32_t plus)
static int ticks_indicate_less_sleep
struct tcp_hpts_entry __aligned(CACHE_LINE_SIZE)
#define DEFAULT_MIN_SLEEP
#define TICKS_INDICATE_MORE_SLEEP
#define HPTS_TICKS_PER_SLOT
#define DYNAMIC_MIN_SLEEP
#define DEFAULT_CONNECTION_THESHOLD
static __inline uint32_t tcp_tv_to_usectick(const struct timeval *sv)
#define DYNAMIC_MAX_SLEEP
static __inline uint32_t tcp_gethptstick(struct timeval *sv)
#define TICKS_INDICATE_LESS_SLEEP
#define LOWEST_SLEEP_ALLOWED
static __inline uint32_t tcp_get_usecs(struct timeval *tv)
static __inline uint32_t tcp_tv_to_hptstick(const struct timeval *sv)
#define TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv)