本文共 9140 字,大约阅读时间需要 30 分钟。
整体理解:根据优先级权重分享调度周期的时间
公式: sched_slice = (task_load / rq_load_total) * sched_latency。sched_latency 在 Linux 上是可配置的:当可运行任务数不超过 8 个(sched_nr_latency)时,调度周期取 sched_latency;如果超过 8 个,调度周期为 rq 上的任务个数 * sched_min_granularity。
ubuntu 上面的配置如下:
调度周期 sched_latency:调度队列中所有任务各执行一次时间片所需要的时间。对应的函数代码:
680 /* 681 * We calculate the wall-time slice from the period by taking a part 682 * proportional to the weight. 683 * 684 * s = p*P[w/rw] 685 */ 686 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 687 { 688 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); 689 690 for_each_sched_entity(se) { 691 struct load_weight *load; 692 struct load_weight lw; 693 694 cfs_rq = cfs_rq_of(se); 695 load = &cfs_rq->load; 696 697 if (unlikely(!se->on_rq)) { 698 lw = cfs_rq->load; 699 700 update_load_add(&lw, se->load.weight); 701 load = &lw; 702 } 703 slice = __calc_delta(slice, se->load.weight, load); 704 } 705 return slice; 706 }
第688行计算调度周期
206 /* 207 * delta_exec * weight / lw.weight 208 * OR 209 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT 210 * 211 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case 212 * we're guaranteed shift stays positive because inv_weight is guaranteed to 213 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. 214 * 215 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus 216 * weight/lw.weight <= 1, and therefore our shift will also be positive. 217 */ 218 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) 219 { 220 u64 fact = scale_load_down(weight); 221 int shift = WMULT_SHIFT; 222 223 __update_inv_weight(lw); 224 225 if (unlikely(fact >> 32)) { 226 while (fact >> 32) { 227 fact >>= 1; 228 shift--; 229 } 230 } 231 232 /* hint to use a 32x32->64 mul */ 233 fact = (u64)(u32)fact * lw->inv_weight; 234 235 while (fact >> 32) { 236 fact >>= 1; 237 shift--; 238 } 239 240 return mul_u64_u32_shr(delta_exec, fact, shift); 241 }
上面代码中的 for_each_sched_entity 循环:调度实体可能是 cgroup 的组调度实体,需要沿 cgroup 的层次结构逐级向上处理。
注意: 由于任务的负载以及cpu的负载是在动态刷新的,所以在每个tick里面会进行任务片的时间的重新计算,同时判断执行时间是否超过时间片了。
调度参数影响分析以及个人理解:
任务的vruntime = (nice0_load/任务优先级权重 )*任务的实际运行时间
注意: CFS 正是通过这个公式体现公平性的。公式理解:优先级越高的任务,vruntime 增加得越少;实际上是因为优先级高的任务分配到的时间片更多,而其 vruntime 的增幅更小,这样在 vruntime 的虚拟时间里,所有任务的 vruntime 增长幅度都是一样的,通过这种方式做到调度的公平性。代码:
708 /* 709 * We calculate the vruntime slice of a to-be-inserted task. 710 * 711 * vs = s/w 712 */ 713 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) 714 { 715 return calc_delta_fair(sched_slice(cfs_rq, se), se); 716 }
653 /* 654 * delta /= w 655 */ 656 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) 657 { 658 if (unlikely(se->load.weight != NICE_0_LOAD)) 659 delta = __calc_delta(delta, NICE_0_LOAD, &se->load); 660 661 return delta; 662 } 663
cfs的公平性详细见下一节的分析
通过上面两节,我们一起看下 vruntime 是如何增长的:
sched_slice = (task_load / rq_load_total) * sched_latency;任务的 vruntime = (nice0_load / 任务优先级权重) * 任务的实际运行时间。上面任务的实际运行时间即为 sched_slice,两个公式合并可以看到:任务的 vruntime 增量 = (nice0_load / rq_load_total) * sched_latency,也即在虚拟时间里每个任务的 vruntime 增长量都是一样的,这样保证了每个任务被调度机会的公平。
在任务被唤醒, 即try_to_wake_up函数的最后会调用 check_preempt_wakeup (函数调用关系 ttwu_queue-> … -> ttwu_do_wakeup-> check_preempt_curr-> check_preempt_curr(check_preempt_wakeup))
check_preempt_tick 在每个时钟 tick 中会被调用,用于检查当前任务是否已用完时间片、是否需要重新调度:
4095 * Preempt the current task with a newly woken task if needed: 4096 */ 4097 static void 4098 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 4099 { 4100 unsigned long ideal_runtime, delta_exec; 4101 struct sched_entity *se; 4102 s64 delta; 4103 4104 ideal_runtime = sched_slice(cfs_rq, curr); 4105 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 4106 if (delta_exec > ideal_runtime) { 4107 resched_curr(rq_of(cfs_rq)); 4108 /* 4109 * The current task ran long enough, ensure it doesn't get 4110 * re-elected due to buddy favours. 4111 */ 4112 clear_buddies(cfs_rq, curr); 4113 return; 4114 } 4115 4116 /* 4117 * Ensure that a task that missed wakeup preemption by a 4118 * narrow margin doesn't have to wait for a full slice. 4119 * This also mitigates buddy induced latencies under load. 4120 */ 4121 if (delta_exec < sysctl_sched_min_granularity) 4122 return; 4123 4124 se = __pick_first_entity(cfs_rq); 4125 delta = curr->vruntime - se->vruntime; 4126 4127 if (delta < 0) 4128 return; 4129 4130 if (delta > ideal_runtime) 4131 resched_curr(rq_of(cfs_rq)); 4132 }
关于时间补偿的文章,理解上有错,但是引出了一篇好的分享文章:Linux进程管理之CFS调度器分析
https://blog.csdn.net/ustc_dylan/article/details/4140245
调度指标如何解决
任务组织: runqueue_head , init_task
p->counter 初始化时候的值
时间片递减的地方,减到 0 则置位 need_resched 标记位(新内核中该标记位挪到了 thread_info 上面)
counter 赋值方法:参考schedule函数里面的赋值, fork里面也可以参考
#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1)
优先级计算函数:
goodnessint weight; /* * select the current process after every other * runnable process, but before the idle thread. * Also, dont trigger a counter recalculation. */ weight = -1; if (p->policy & SCHED_YIELD) goto out; /* * Non-RT process - normal case first. */ if (p->policy == SCHED_OTHER) { /* * Give the process a first-approximation goodness value * according to the number of clock-ticks it has left. * * Don't do any other calculations if the time slice is * over.. */ weight = p->counter; if (!weight) goto out;#ifdef CONFIG_SMP /* Give a largish advantage to the same processor... */ /* (this is equivalent to penalizing other processors) */ if (p->processor == this_cpu) weight += PROC_CHANGE_PENALTY;#endif /* .. and a slight advantage to the current MM */ if (p->mm == this_mm || !p->mm) weight += 1; weight += 20 - p->nice; goto out; } /* * Realtime process, select the first one on the * runqueue (taking priorities within processes * into account). */ weight = 1000 + p->rt_priority;out: return weight;}
goodness 中的 weight += 20 - p->nice; 是在剩余时间片(weight = p->counter)的基础上叠加静态优先级(nice 值)奖励,对尚有剩余时间片的任务形成奖励,体现调度执行的公平性。
场景处理:短作业任务,多级反馈队列(动态调整优先级),公平性
调度指标:吞吐量,响应时间,周转时间,公平性,实时性,能耗,资源利用率,任务完成时间。
优缺点:
优点:不用考虑负载均衡。缺点:多核场景下单一队列成为瓶颈;时间片过长时,任务响应不及时。
奖惩制度:effective_prio
普通进程的动态优先级计算没有那么简单,除了静态优先级,还需要考虑进程的平均睡眠时间(保存在进程描述符的sleep_avg成员中),并根据该值对进程进行奖惩。交互式进程的判断方法:
/* * If a task is 'interactive' then we reinsert it in the active * array after it has expired its current timeslice. (it will not * continue to run immediately, it will still roundrobin with * other interactive tasks.) * * This part scales the interactivity limit depending on niceness. * * We scale it linearly, offset by the INTERACTIVE_DELTA delta. * Here are a few examples of different nice levels: * * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] * * (the X axis represents the possible -5 ... 0 ... +5 dynamic * priority range a task can explore, a value of '1' means the * task is rated interactive.) * * Ie. nice +19 tasks can never get 'interactive' enough to be * reinserted into the active array. And only heavily CPU-hog nice -20 * tasks will be expired. Default nice 0 tasks are somewhere between, * it takes some effort for them to get interactive, but it's not * too hard. */#define CURRENT_BONUS(p) \ (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ MAX_SLEEP_AVG)
现代操作系统原理与实现
O(n)、O(1)和CFS调度器
http://www.wowotech.net/process_management/scheduler-history.html
next_buddy、last_buddy
http://blog.chinaunix.net/uid-27767798-id-3548384.html
Linux进程管理之CFS调度器分析
https://blog.csdn.net/lcw_202/article/details/6033346