#include "opt_hwpmc_hooks.h"

#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/turnstile.h>
#include <sys/umtxvar.h>
#include <sys/vmmeter.h>
#include <sys/cpuset.h>
#include <sys/pmckern.h>
#include <sys/dtrace_bsd.h>

dtrace_vtime_switch_func_t dtrace_vtime_switch_func;

#include <machine/cpu.h>
#include <machine/smp.h>
#define	TS_NAME_LEN	(MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
#define	TDQ_NAME_LEN	(sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU)))
#define	TDQ_LOADNAME_LEN	(sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load"))

#define	TSF_BOUND	0x0001
#define	TSF_XFERABLE	0x0002

#define	THREAD_CAN_MIGRATE(td)	((td)->td_pinned == 0)
#define	THREAD_CAN_SCHED(td, cpu)	\
    CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
_Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <=
    sizeof(struct thread0_storage),
    "increase struct thread0_storage.t0st_sched size");
#define	PRI_TIMESHARE_RANGE	(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	PRI_INTERACT_RANGE	((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2)
#define	PRI_BATCH_RANGE		(PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE)

#define	PRI_MIN_INTERACT	PRI_MIN_TIMESHARE
#define	PRI_MAX_INTERACT	(PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1)
#define	PRI_MIN_BATCH		(PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE)
#define	PRI_MAX_BATCH		PRI_MAX_TIMESHARE
#define	SCHED_TICK_SECS		10
#define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
#define	SCHED_TICK_MAX		(SCHED_TICK_TARG + hz)
#define	SCHED_TICK_SHIFT	10
#define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
#define	SCHED_TICK_TOTAL(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, hz))
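/*
 * Reading the tick macros above: ts_ticks accumulates CPU time scaled up
 * by SCHED_TICK_SHIFT and aged so that it covers a window of roughly
 * SCHED_TICK_SECS seconds (bounded by SCHED_TICK_MAX).  SCHED_TICK_HZ()
 * strips the scale factor back off and SCHED_TICK_TOTAL() is the length
 * of the sampling window (never less than one second), so the two can be
 * combined into a usage ratio.
 */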
#define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define	SCHED_PRI_MIN		(PRI_MIN_BATCH + SCHED_PRI_NHALF)
#define	SCHED_PRI_MAX		(PRI_MAX_BATCH - SCHED_PRI_NHALF)
#define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN + 1)
#define	SCHED_PRI_TICKS(ts)						\
    (SCHED_TICK_HZ((ts)) /						\
    (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
#define	SCHED_PRI_NICE(nice)	(nice)
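/*
 * Sketch of how sched_priority() combines the macros above for a batch
 * thread (approximate; see that function for the exact clamping):
 *
 *	pri = SCHED_PRI_MIN;
 *	pri += min(SCHED_PRI_TICKS(ts), SCHED_PRI_RANGE - 1);
 *	pri += SCHED_PRI_NICE(td->td_proc->p_nice);
 *
 * Recent CPU usage maps onto SCHED_PRI_RANGE priority steps and the nice
 * value is added on directly.
 */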
#define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
#define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(30)
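/*
 * The interactivity score built from these limits falls in
 * [0, SCHED_INTERACT_MAX): threads that sleep more than they run score
 * below SCHED_INTERACT_HALF, threads that run more than they sleep score
 * above it, and anything under SCHED_INTERACT_THRESH is treated as
 * interactive and scheduled in the interactive priority range.
 */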
#define	SCHED_SLICE_DEFAULT_DIVISOR	10
#define	SCHED_SLICE_MIN_DIVISOR		6

#define	TDF_PICKCPU	TDF_SCHED0
#define	TDF_SLICEEND	TDF_SCHED2
#ifdef FULL_PREEMPTION
#define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 1000))
#define	SCHED_AFFINITY(ts, t)	((ts)->ts_rltick > ticks - ((t) * affinity))
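/*
 * SCHED_AFFINITY() answers "did this thread run on its last CPU recently
 * enough that its cache is probably still warm?".  The window scales with
 * the topology level t, and affinity defaults to roughly one millisecond
 * worth of hz ticks.
 */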
static int rebalance = 1;
static int balance_interval = 128;
static int balance_ticks;
#define	TDQ_SELF()	((struct tdq *)PCPU_GET(sched))
#define	TDQ_CPU(x)	(DPCPU_ID_PTR((x), tdq))
#define	TDQ_ID(x)	((x)->tdq_id)

#define	TDQ_SELF()	(&tdq_cpu)
#define	TDQ_CPU(x)	(&tdq_cpu)
#define	TDQ_LOCK_ASSERT(t, type)	mtx_assert(TDQ_LOCKPTR((t)), (type))
#define	TDQ_LOCK(t)		mtx_lock_spin(TDQ_LOCKPTR((t)))
#define	TDQ_LOCK_FLAGS(t, f)	mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
#define	TDQ_TRYLOCK(t)		mtx_trylock_spin(TDQ_LOCKPTR((t)))
#define	TDQ_TRYLOCK_FLAGS(t, f)	mtx_trylock_spin_flags(TDQ_LOCKPTR((t)), (f))
#define	TDQ_UNLOCK(t)		mtx_unlock_spin(TDQ_LOCKPTR((t)))
#define	TDQ_LOCKPTR(t)		((struct mtx *)(&(t)->tdq_lock))
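/*
 * Each run queue is protected by its own spin mutex (tdq_lock); a
 * runnable or running thread's td_lock points at the lock of the queue it
 * is currently assigned to.  The macros above are thin wrappers around
 * the spin mutex API for that per-queue lock.
 */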
static void tdq_add(struct tdq *, struct thread *, int);
static struct thread *tdq_move(struct tdq *, struct tdq *);
static int tdq_idled(struct tdq *);
static void tdq_notify(struct tdq *, struct thread *);
static struct thread *tdq_steal(struct tdq *, int);
static struct thread *runq_steal(struct runq *, int);
static int sched_pickcpu(struct thread *, int);
static void sched_balance(void);
static int sched_balance_pair(struct tdq *, struct tdq *);
static inline struct tdq *sched_setcpu(struct thread *, int, int);
static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS);
static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb,
    struct cpu_group *cg, int indent);
353 "struct proc *",
"uint8_t");
355 "struct proc *",
"void *");
357 "struct proc *",
"void *",
"int");
359 "struct proc *",
"uint8_t",
"struct thread *");
	for (i = 0; i < RQB_LEN; i++) {
		printf("\t\trunq bits %d 0x%zx\n",
		    i, rq->rq_status.rqb_bits[i]);
		for (j = 0; j < RQB_BPW; j++)
			if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
				pri = j + (i << RQB_L2BPW);
				rqh = &rq->rq_queues[pri];
				TAILQ_FOREACH(td, rqh, td_runq) {
					printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
					    td, td->td_name, td->td_priority,
					    td->td_rqindex, pri);

	printf("\trealtime runq:\n");
	printf("\ttimeshare runq:\n");
	if (cpri >= PRI_MIN_IDLE)

	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
	pri = td->td_priority;
	ts = td_get_sched(td);
	    ("Invalid priority %d on timeshare runq", pri));
	if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
		pri = (unsigned char)(pri - 1) % RQ_NQS;
	ts = td_get_sched(td);
	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
	KASSERT(ts->ts_runq != NULL,
	    ("tdq_runq_remove: thread %p null ts_runq", td));
	ts->ts_flags &= ~TSF_XFERABLE;
	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
	if ((td->td_flags & TDF_NOLOAD) == 0)
	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);

	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
	if ((td->td_flags & TDF_NOLOAD) == 0)
	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
	if (td == NULL || td->td_priority > ctd->td_priority)
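/*
 * sched_random() below is a cheap per-CPU pseudo-random source: each CPU
 * keeps its own state in the DPCPU variable randomval, advances it with a
 * 69069 linear-congruential step, and returns the better-mixed high bits.
 * It only needs to be good enough to break ties in load balancing.
 */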
	rndptr = DPCPU_PTR(randomval);
	*rndptr = *rndptr * 69069 + 5;

	return (*rndptr >> 16);
struct cpu_search_res {

cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s,
    struct cpu_search_res *r)
	struct cpu_search_res lr;
	int c, bload, l, load, p, total;

	if (cg->cg_children > 0) {
		for (c = cg->cg_children - 1; c >= 0; c--) {
			load = cpu_search_lowest(&cg->cg_child[c], s, &lr);
			if (__predict_false(s->cs_running) &&
			    (cg->cg_child[c].cg_flags & CG_FLAG_THREAD) &&
			    load >= 128 && (load & 128) != 0)
			if (lr.csr_cpu >= 0 && (load < bload ||
			    (load == bload && lr.csr_load < r->csr_load))) {
				r->csr_cpu = lr.csr_cpu;
				r->csr_load = lr.csr_load;
	for (c = cg->cg_last; c >= cg->cg_first; c--) {
		if (!CPU_ISSET(c, &cg->cg_mask))
		if (c == s->cs_prefer) {
			if (__predict_false(s->cs_running))
		    (!s->cs_running || c != s->cs_prefer)) ||
		    !CPU_ISSET(c, s->cs_mask))
		if (__predict_false(s->cs_running) && l > 0)
		load -= sched_random() % 128;
		if (bload > load - p) {
cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s,
    struct cpu_search_res *r)
	struct cpu_search_res lr;
	int c, bload, l, load, total;

	if (cg->cg_children > 0) {
		for (c = cg->cg_children - 1; c >= 0; c--) {
			load = cpu_search_highest(&cg->cg_child[c], s, &lr);
			if (lr.csr_cpu >= 0 && (load > bload ||
			    (load == bload && lr.csr_load > r->csr_load))) {
				r->csr_cpu = lr.csr_cpu;
				r->csr_load = lr.csr_load;
	for (c = cg->cg_last; c >= cg->cg_first; c--) {
		if (!CPU_ISSET(c, &cg->cg_mask))
		    !CPU_ISSET(c, s->cs_mask))
		load -= sched_random() % 256;
sched_lowest(const struct cpu_group *cg, cpuset_t *mask, int pri,
    int maxload, int prefer, int running)
	struct cpu_search_res r;

	s.cs_prefer = prefer;
	s.cs_running = running;
	cpu_search_lowest(cg, &s, &r);
sched_highest(const struct cpu_group *cg, cpuset_t *mask, int minload,
    int mintrans)
	struct cpu_search_res r;

	s.cs_trans = mintrans;
	cpu_search_highest(cg, &s, &r);
sched_balance_group(struct cpu_group *cg)
	cpuset_t hmask, lmask;
	int high, low, anylow;

	high = sched_highest(cg, &hmask, 1, 0);
	CPU_CLR(high, &hmask);
	CPU_COPY(&hmask, &lmask);
	if (CPU_EMPTY(&lmask))
	if ((td->td_flags & TDF_IDLETD) == 0 &&
	ipi_cpu(high, IPI_AST);
	low = sched_lowest(cg, &lmask, -1, tdq->tdq_load - 1, high, 1);
	if (anylow && low == -1)
	CPU_CLR(low, &hmask);
	CPU_CLR(low, &lmask);
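/*
 * The long-term balancer below rearms itself at a randomized interval
 * (between balance_interval/2 and 3*balance_interval/2 stathz ticks) so
 * that periodic workloads cannot synchronize with it, then rebalances the
 * whole topology starting from cpu_top.
 */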
	balance_ticks = max(balance_interval / 2, 1) +
	    (sched_random() % balance_interval);
	sched_balance_group(cpu_top);
tdq_lock_pair(struct tdq *one, struct tdq *two)

tdq_unlock_pair(struct tdq *one, struct tdq *two)
sched_balance_pair(struct tdq *high, struct tdq *low)

	tdq_lock_pair(high, low);
	    (td = tdq_move(high, low)) != NULL) {
	if (cpu != PCPU_GET(cpuid))
	tdq_unlock_pair(high, low);
static struct thread *
tdq_move(struct tdq *from, struct tdq *to)

	td = tdq_steal(tdq, cpu);
	td_get_sched(td)->ts_cpu = cpu;
	struct cpu_group *cg, *parent;
	int cpu, switchcnt, goup;

	CPU_CLR(PCPU_GET(cpuid), &mask);
	cpu = sched_highest(cg, &mask, steal_thresh, 1);
	if (parent->cg_children == 2) {
		if (cg == &parent->cg_child[0])
			cg = &parent->cg_child[1];
			cg = &parent->cg_child[0];
	if (steal->tdq_load < steal_thresh ||
	CPU_CLR(cpu, &mask);
	if (steal->tdq_load < steal_thresh ||
		tdq_unlock_pair(tdq, steal);
	if (tdq_move(steal, tdq) != NULL)
	CPU_CLR(cpu, &mask);
	tdq_unlock_pair(tdq, steal);
tdq_notify(struct tdq *tdq, struct thread *td)

	cpu = td_get_sched(td)->ts_cpu;
	pri = td->td_priority;
	atomic_thread_fence_seq_cst();
	if (TD_IS_IDLETHREAD(ctd)) {
	ipi_cpu(cpu, IPI_PREEMPT);
static struct thread *
runq_steal_from(struct runq *rq, int cpu, u_char start)

	struct thread *td, *first;

	rqb = &rq->rq_status;
	bit = start & (RQB_BPW - 1);
	for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
		if (rqb->rqb_bits[i] == 0)
			bit = RQB_FFS(rqb->rqb_bits[i]);
		for (; bit < RQB_BPW; bit++) {
			if ((rqb->rqb_bits[i] & (1ul << bit)) == 0)
			rqh = &rq->rq_queues[bit + (i << RQB_L2BPW)];
			TAILQ_FOREACH(td, rqh, td_runq) {
static struct thread *
runq_steal(struct runq *rq, int cpu)

	rqb = &rq->rq_status;
	for (word = 0; word < RQB_LEN; word++) {
		if (rqb->rqb_bits[word] == 0)
		for (bit = 0; bit < RQB_BPW; bit++) {
			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
			TAILQ_FOREACH(td, rqh, td_runq)
static struct thread *
tdq_steal(struct tdq *tdq, int cpu)
static inline struct tdq *
sched_setcpu(struct thread *td, int cpu, int flags)

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td_get_sched(td)->ts_cpu = cpu;
	KASSERT((flags & SRQ_HOLD) == 0,
	    ("sched_setcpu: Invalid lock for SRQ_HOLD"));
	if ((flags & SRQ_HOLD) == 0)
		mtx_unlock_spin(mtx);
SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding");
SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity");
SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity");
SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load");
SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu");
SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
sched_pickcpu(struct thread *td, int flags)

	struct cpu_group *cg, *ccg;
	int cpu, pri, r, self, intr;

	self = PCPU_GET(cpuid);
	ts = td_get_sched(td);
	KASSERT(!CPU_ABSENT(ts->ts_cpu), ("sched_pickcpu: Start scheduler on "
	    "absent CPU %d for thread %s.", ts->ts_cpu, td->td_name));
		return (ts->ts_cpu);
	    curthread->td_intr_nesting_level) {
		SCHED_STAT_INC(pickcpu_idle_affinity);
	    SCHED_AFFINITY(ts, CG_SHARE_L2)) {
		if (cg->cg_flags & CG_FLAG_THREAD) {
			for (cpu = cg->cg_first; cpu <= cg->cg_last; cpu++) {
				if (CPU_ISSET(cpu, &cg->cg_mask) &&
			if (cpu > cg->cg_last) {
				SCHED_STAT_INC(pickcpu_idle_affinity);
				return (ts->ts_cpu);
		SCHED_STAT_INC(pickcpu_idle_affinity);
		return (ts->ts_cpu);
	for (ccg = NULL; cg != NULL; cg = cg->cg_parent) {
		if (cg->cg_flags & CG_FLAG_THREAD)
		if (cg->cg_children == 1 || cg->cg_count == 1)
		if (cg->cg_level == CG_SHARE_NONE ||
		    (!intr && !SCHED_AFFINITY(ts, cg->cg_level)))
	mask = &td->td_cpuset->cs_mask;
	pri = td->td_priority;
	r = TD_IS_RUNNING(td);
	if (ccg != NULL && intr) {
		cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu, r);
			SCHED_STAT_INC(pickcpu_intrbind);
		cpu = sched_lowest(ccg, mask, max(pri, PRI_MAX_TIMESHARE),
		    INT_MAX, ts->ts_cpu, r);
			SCHED_STAT_INC(pickcpu_affinity);
		cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu, r);
			SCHED_STAT_INC(pickcpu_lowest);
		cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu, r);
			SCHED_STAT_INC(pickcpu_lowest);
	KASSERT(cpu >= 0, ("sched_pickcpu: Failed to find a cpu."));
	KASSERT(!CPU_ABSENT(cpu), ("sched_pickcpu: Picked absent CPU %d.", cpu));
		SCHED_STAT_INC(pickcpu_local);
	if (cpu != ts->ts_cpu)
		SCHED_STAT_INC(pickcpu_migration);
static struct thread *
	    ("tdq_choose: Invalid priority on timeshare queue %d",
	KASSERT(td->td_priority >= PRI_MIN_IDLE,
	    ("tdq_choose: Invalid priority on idle queue %d",

	printf("ULE: setup cpu %d\n", id);
sched_setup_smp(void)

	cpu_top = smp_topo();
		tdq = DPCPU_ID_PTR(i, tdq);
		tdq->tdq_cg = smp_topo_find(cpu_top, i);
			panic("Can't find cpu group for %d\n", i);
		DPCPU_ID_SET(i, randomval, i * 69069 + 5);
	PCPU_SET(sched, DPCPU_PTR(tdq));
	balance_ticks = balance_interval;
	affinity = SCHED_AFFINITY_DEFAULT;
	ts = td_get_sched(td);
	    ts->ts_runtime >= ts->ts_slptime)
	if (ts->ts_runtime > ts->ts_slptime) {
	if (ts->ts_slptime > ts->ts_runtime) {
		return (ts->ts_runtime / div);
	if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)
	    ("sched_priority: invalid interactive priority %u score %u",
	    ("sched_priority: invalid priority %u: nice %d, "
	    "ticks %d ftick %d ltick %d tick pri %d",
	    pri, td->td_proc->p_nice, td_get_sched(td)->ts_ticks,
	ts = td_get_sched(td);
	sum = ts->ts_runtime + ts->ts_slptime;
	if (ts->ts_runtime > ts->ts_slptime) {
		ts->ts_runtime /= 2;
		ts->ts_slptime /= 2;
	ts->ts_runtime = (ts->ts_runtime / 5) * 4;
	ts->ts_slptime = (ts->ts_slptime / 5) * 4;
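/*
 * sched_interact_update() keeps ts_runtime + ts_slptime bounded by
 * SCHED_SLP_RUN_MAX: once the sum grows past the limit both terms are
 * scaled back (halved, or multiplied by 4/5 as above) so the
 * interactivity score tracks recent behaviour rather than the thread's
 * entire history.
 */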
	ts = td_get_sched(td);
	sum = ts->ts_runtime + ts->ts_slptime;
	ts->ts_runtime /= ratio;
	ts->ts_slptime /= ratio;
	ts0 = td_get_sched(&thread0);
	PCPU_SET(sched, DPCPU_PTR(tdq));
	ts->ts_ticks = (ts->ts_ticks / (ts->ts_ltick - ts->ts_ftick)) *
	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio",
	    "prio:%d", td->td_priority, "new prio:%d", prio,
	SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio);
	if (td != curthread && prio < td->td_priority) {
		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
		SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio,
	ts = td_get_sched(td);
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (td->td_priority == prio)
	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
		td->td_priority = prio;
		sched_add(td, SRQ_BORROWING | SRQ_HOLDTD);
	if (TD_IS_RUNNING(td)) {
		oldpri = td->td_priority;
		td->td_priority = prio;
	td->td_priority = prio;

	td->td_flags |= TDF_BORROWING;

	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
	    td->td_base_pri <= PRI_MAX_TIMESHARE)
		base_pri = td->td_user_pri;
		base_pri = td->td_base_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_BORROWING;

	td->td_base_pri = prio;
	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
	oldprio = td->td_priority;
	if (TD_ON_LOCK(td) && oldprio != prio)

	td->td_base_user_pri = prio;
	if (td->td_lend_user_pri <= prio)
	td->td_user_pri = prio;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_lend_user_pri = prio;
	td->td_user_pri = min(prio, td->td_base_user_pri);
	if (td->td_priority > td->td_user_pri)
	else if (td->td_priority != td->td_user_pri)
		td->td_flags |= TDF_NEEDRESCHED;

	if (td->td_lend_user_pri != prio)
	if (td->td_user_pri != min(prio, td->td_base_user_pri))
	if (td->td_priority != td->td_user_pri)
tdq_trysteal(struct tdq *tdq)

	struct cpu_group *cg, *parent;

	if (smp_started == 0 || steal_idle == 0 || trysteal_limit == 0 ||
	CPU_CLR(PCPU_GET(cpuid), &mask);
	for (i = 1, cg = tdq->tdq_cg, goup = 0; ; ) {
		cpu = sched_highest(cg, &mask, steal_thresh, 1);
		if (++i > trysteal_limit) {
		if (parent->cg_children == 2) {
			if (cg == &parent->cg_child[0])
				cg = &parent->cg_child[1];
				cg = &parent->cg_child[0];
		if (steal->tdq_load < steal_thresh ||
		if (steal->tdq_load < steal_thresh ||
		if (tdq_move(steal, tdq) == NULL) {
	    (td_get_sched(td)->ts_flags & TSF_BOUND) != 0,
	    ("Thread %p shouldn't migrate", td));
	KASSERT(!CPU_ABSENT(td_get_sched(td)->ts_cpu), ("sched_switch_migrate: "
	    "thread %s queued on absent CPU %d.", td->td_name,
	    td_get_sched(td)->ts_cpu));
	tdn = TDQ_CPU(td_get_sched(td)->ts_cpu);
	tdq_notify(tdn, td);
	atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
	struct thread *newtd;
	int cpuid, preempted;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	cpuid = PCPU_GET(cpuid);
	ts = td_get_sched(td);
	ts->ts_rltick = ticks - affinity * MAX_CACHE_LEVELS;
	td->td_lastcpu = td->td_oncpu;
	    (flags & SW_PREEMPT) != 0;
	td->td_owepreempt = 0;
	if (!TD_IS_IDLETHREAD(td))
	if (TD_IS_IDLETHREAD(td)) {
	} else if (TD_IS_RUNNING(td)) {
		srqflag = preempted ?
		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
		    SRQ_OURSELF|SRQ_YIELDING;
			ts->ts_cpu = sched_pickcpu(td, 0);
		if (ts->ts_cpu == cpuid)
		mtx_unlock_spin(mtx);
#if (KTR_COMPILE & KTR_SCHED) != 0
	if (TD_IS_IDLETHREAD(td))
		KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
		    "prio:%d", td->td_priority);
		KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
		    "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
		    "lockname:\"%s\"", td->td_lockname);
	if (PMC_PROC_IS_USING_PMCS(td->td_proc))
		PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
	SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc);
	if (dtrace_vtime_active)
		(*dtrace_vtime_switch_func)(newtd);
	td->td_oncpu = NOCPU;
	cpu_switch(td, newtd, mtx);
	cpuid = td->td_oncpu = PCPU_GET(cpuid);
	SDT_PROBE0(sched, , , on__cpu);
	if (PMC_PROC_IS_USING_PMCS(td->td_proc))
		PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
	SDT_PROBE0(sched, , , remain__cpu);
	KASSERT(curthread->td_md.md_spinlock_count == 1,
	    ("invalid count %d", curthread->td_md.md_spinlock_count));
	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
	    "prio:%d", td->td_priority);
	PROC_LOCK_ASSERT(p, MA_OWNED);
	FOREACH_THREAD_IN_PROC(p, td) {

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_slptick = ticks;
	if (TD_IS_SUSPENDED(td) || prio >= PSOCK)
		td->td_flags |= TDF_CANSWAP;
	if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td_get_sched(td);
	td->td_flags &= ~TDF_CANSWAP;
	slptick = td->td_slptick;
	if (slptick && slptick != ticks) {
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td_get_sched(td)->ts_runtime += tickincr;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td_get_sched(td);
	ts2 = td_get_sched(child);
	child->td_oncpu = NOCPU;
	child->td_lastcpu = NOCPU;
	child->td_domain.dr_policy = td->td_cpuset->cs_domain;
	bzero(ts2->ts_name, sizeof(ts2->ts_name));
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (td->td_pri_class == class)
	td->td_pri_class = class;

	    "prio:%d", child->td_priority);
	PROC_LOCK_ASSERT(p, MA_OWNED);
	td = FIRST_THREAD_IN_PROC(p);
	    "prio:%d", child->td_priority);
	td_get_sched(td)->ts_runtime += td_get_sched(child)->ts_runtime;
	SDT_PROBE2(sched, , , surrender, td, td->td_proc);
	if (td->td_critnest == 1) {
		flags = SW_INVOL | SW_PREEMPT;
		flags |= TD_IS_IDLETHREAD(td) ? SWT_REMOTEWAKEIDLE :
		td->td_owepreempt = 1;

	td->td_priority = td->td_user_pri;
	td->td_base_pri = td->td_user_pri;
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	    balance_ticks != 0) {
		balance_ticks -= cnt;
		if (balance_ticks <= 0)
	ts = td_get_sched(td);
	if ((td->td_pri_class & PRI_FIFO_BIT) || TD_IS_IDLETHREAD(td))
	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) {
		td_get_sched(td)->ts_runtime += tickincr * cnt;
	ts->ts_slice += cnt;
	if ((curthread->td_flags & TDF_IDLETD) != 0) {

	return (PCPU_GET(idlethread));

	THREAD_LOCK_ASSERT(curthread, MA_OWNED);
	pri = td->td_priority;
	cpri = ctd->td_priority;
	ctd->td_flags |= TDF_NEEDRESCHED;
	if (KERNEL_PANICKED() || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
	ctd->td_owepreempt = 1;
	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("sched_add: trying to run inhibited thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("sched_add: bad thread state"));
	KASSERT(td->td_flags & TDF_INMEM,
	    ("sched_add: thread swapped out"));

	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
	    flags & SRQ_PREEMPTED);
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
	cpu = sched_pickcpu(td, flags);
	tdq = sched_setcpu(td, cpu, flags);
	if (cpu != PCPU_GET(cpuid))
		tdq_notify(tdq, td);
	else if (!(flags & SRQ_YIELDING))
	if ((flags & SRQ_HOLD) != 0)
	if (!(flags & SRQ_YIELDING))
	if (!(flags & SRQ_HOLDTD))
	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
	    "prio:%d", td->td_priority);
	SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
	KASSERT(TD_ON_RUNQ(td),
	    ("sched_rem: thread not on run queue"));
	ts = td_get_sched(td);
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	pctcpu = (FSCALE * ((FSCALE * rtick) / hz)) >> FSHIFT;
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td_get_sched(td);
	if (TD_ON_RUNQ(td)) {
	if (!TD_IS_RUNNING(td))
	td->td_flags |= TDF_NEEDRESCHED;
	if (td != curthread)
		ipi_cpu(ts->ts_cpu, IPI_PREEMPT);
	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
	KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
	ts = td_get_sched(td);
	if (PCPU_GET(cpuid) == cpu)

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT(td == curthread, ("sched_unbind: can only bind curthread"));
	ts = td_get_sched(td);
	ts->ts_flags &= ~TSF_BOUND;
	THREAD_LOCK_ASSERT(td, MA_OWNED);

	total += TDQ_CPU(i)->tdq_sysload;
	return (sizeof(struct proc));

	return (sizeof(struct thread) + sizeof(struct td_sched));
#define	TDQ_IDLESPIN(tdq)						\
    ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0)

#define	TDQ_IDLESPIN(tdq)	1
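/*
 * The two TDQ_IDLESPIN() variants above gate busy-waiting in the idle
 * loop: on SMP the idle thread only spins polling for work when its run
 * queue does not belong to an SMT thread group, since spinning on one
 * hardware thread would steal cycles from its sibling; the uniprocessor
 * variant always permits spinning.
 */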
	int oldswitchcnt, switchcnt;

	mtx_assert(&Giant, MA_NOTOWNED);
	THREAD_NO_SLEEPING();
	if (always_steal || switchcnt != oldswitchcnt) {
		oldswitchcnt = switchcnt;
		if (tdq_idled(tdq) == 0)
	oldswitchcnt = switchcnt;
	if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt)
	atomic_thread_fence_seq_cst();
	if (switchcnt != oldswitchcnt)
static struct thread *
	struct thread *newtd;

	KASSERT(curthread->td_md.md_spinlock_count == 1,
	    ("invalid count %d", curthread->td_md.md_spinlock_count));

	struct thread *newtd;

	PCPU_SET(switchticks, ticks);
	cpu_throw(NULL, newtd);
	struct thread *newtd;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;

	KASSERT(curthread->td_md.md_spinlock_count == 1,
	    ("invalid count %d", curthread->td_md.md_spinlock_count));
	cpuid = PCPU_GET(cpuid);
	td->td_oncpu = cpuid;
	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
	    "prio:%d", td->td_priority);
	SDT_PROBE0(sched, , , on__cpu);
	ts = td_get_sched(td);
	if (ts->ts_name[0] == '\0')
		    "%s tid %d", td->td_name, td->td_tid);
	return (ts->ts_name);
	return (td->td_name);

sched_clear_tdname(struct thread *td)

	ts = td_get_sched(td);
	ts->ts_name[0] = '\0';
sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg,
    int indent)
	char cpusetbuf[CPUSETBUFSIZ];

	sbuf_printf(sb, "%*s<group level=\"%d\" cache-level=\"%d\">\n", indent,
	    "", 1 + indent / 2, cg->cg_level);
	sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"%s\">", indent, "",
	for (i = cg->cg_first; i <= cg->cg_last; i++) {
		if (CPU_ISSET(i, &cg->cg_mask)) {
	if (cg->cg_flags != 0) {
		if ((cg->cg_flags & CG_FLAG_HTT) != 0)
			sbuf_printf(sb, "<flag name=\"HTT\">HTT group</flag>");
		if ((cg->cg_flags & CG_FLAG_THREAD) != 0)
			sbuf_printf(sb, "<flag name=\"THREAD\">THREAD group</flag>");
		if ((cg->cg_flags & CG_FLAG_SMT) != 0)
			sbuf_printf(sb, "<flag name=\"SMT\">SMT group</flag>");
		if ((cg->cg_flags & CG_FLAG_NODE) != 0)
			sbuf_printf(sb, "<flag name=\"NODE\">NUMA node</flag>");
	if (cg->cg_children > 0) {
		for (i = 0; i < cg->cg_children; i++)
			sysctl_kern_sched_topology_spec_internal(sb,
			    &cg->cg_child[i], indent + 2);
sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS)

	KASSERT(cpu_top != NULL, ("cpu_top isn't initialized"));
	err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1);
	int error, new_val, period;

	if (error != 0 || req->newptr == NULL)
	sched_slice = imax(1, (new_val + period / 2) / period);
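/*
 * sysctl_kern_quantum() takes the quantum in microseconds and converts it
 * to stathz ticks, rounding to the nearest whole tick (period is the tick
 * length in microseconds) and never letting the slice drop below one
 * tick.
 */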
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    "Quantum for timeshare threads in microseconds");
    "Quantum for timeshare threads in stathz ticks");
    "Interactivity score threshold");
    "Maximal (lowest) priority for preemption");
    "Assign static kernel priorities to sleeping threads");
    "Number of times idle thread will spin waiting for new work");
    "Threshold before we will permit idle thread spinning");
SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
    "Number of hz ticks to keep thread affinity for");
SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
    "Enables the long-term load balancer");
SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
    &balance_interval, 0,
    "Average period in stathz ticks to run the long-term balancer");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
    "Attempts to steal work from other cores before idling");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
    "Minimum load on remote CPU before we'll steal");
SYSCTL_INT(_kern_sched, OID_AUTO, trysteal_limit, CTLFLAG_RW, &trysteal_limit,
    0, "Topological distance limit for stealing threads in sched_switch()");
SYSCTL_INT(_kern_sched, OID_AUTO, always_steal, CTLFLAG_RW, &always_steal, 0,
    "Always run the stealer from the idle thread");
SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING |
    CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec,
    "A", "XML dump of detected CPU topology");
    "Decay factor used for updating %CPU in 4BSD scheduler");