Linux cpufreq interactive 调频代码实现(下)

上一篇文章简单介绍了 cpufreq 的初始化过程:初始化的最后会调用 cpufreq_init_policy(),并在这个函数里启动对应的 governor。

static void cpufreq_init_policy(struct cpufreq_policy *policy)
{
	struct cpufreq_policy new_policy;
	int ret = 0;

	memcpy(&new_policy, policy, sizeof(*policy));
	/* assure that the starting sequence is run in cpufreq_set_policy */
	policy->governor = NULL;

	/* set default policy */
	ret = cpufreq_set_policy(policy, &new_policy);
	policy->user_policy.policy = policy->policy;
	policy->user_policy.governor = policy->governor;

	if (ret) {
		pr_debug("setting policy failed\n");
		if (cpufreq_driver->exit)
			cpufreq_driver->exit(policy);
	}
}



/*
 * Validate and apply a new frequency policy.
 *
 * policy : current policy.
 * new_policy: policy to be set.
 *
 * The requested limits are checked against the user limits and the driver's
 * ->verify() hook, the policy notifier chain is given the chance to adjust
 * them, and the resulting min/max are copied into @policy.  A driver that
 * provides ->setpolicy() receives the new policy directly; otherwise the
 * governor is switched if it differs from the current one and is then sent
 * CPUFREQ_GOV_LIMITS.
 *
 * Returns 0 on success.  On failure returns -EINVAL or the non-zero result
 * of ->verify(), ->setpolicy() or the governor LIMITS event.
 */
static int cpufreq_set_policy(struct cpufreq_policy *policy,
				struct cpufreq_policy *new_policy)
{
	int ret = 0, failed = 1;

	pr_debug("setting new policy for CPU %u: %u - %u kHz\n", new_policy->cpu,
		new_policy->min, new_policy->max);

	/* The request always inherits cpuinfo from the current policy. */
	memcpy(&new_policy->cpuinfo, &policy->cpuinfo, sizeof(policy->cpuinfo));

	/* Reject a request whose range does not overlap the user limits. */
	if (new_policy->min > policy->user_policy.max
	    || new_policy->max < policy->user_policy.min) {
		ret = -EINVAL;
		goto error_out;
	}

	/* verify the cpu speed can be set within this limit */
	// let the driver check the requested min/max frequencies for this policy
	ret = cpufreq_driver->verify(new_policy);
	if (ret)
		goto error_out;

	/* adjust if necessary - all reasons */
	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
			CPUFREQ_ADJUST, new_policy);

	/* adjust if necessary - hardware incompatibility*/
	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
			CPUFREQ_INCOMPATIBLE, new_policy);

	/*
	 * verify the cpu speed can be set within this limit, which might be
	 * different to the first one
	 */
	ret = cpufreq_driver->verify(new_policy);
	if (ret)
		goto error_out;

	/* notification of the new policy */
	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
			CPUFREQ_NOTIFY, new_policy);

	policy->min = new_policy->min;
	policy->max = new_policy->max;
	trace_cpu_frequency_limits(policy->max, policy->min, policy->cpu);

	pr_debug("new min and max freqs are %u - %u kHz\n",
					policy->min, policy->max);

	if (cpufreq_driver->setpolicy) { // NOTE(review): per the article, the driver discussed here does not implement ->setpolicy
		policy->policy = new_policy->policy;
		pr_debug("setting range\n");
		ret = cpufreq_driver->setpolicy(new_policy);
	} else {
		if (new_policy->governor != policy->governor) { // on the init path cpufreq_init_policy() set policy->governor to NULL, so the two differ
			/* save old, working values */
			struct cpufreq_governor *old_gov = policy->governor;

			pr_debug("governor switch\n");

			/* end old governor */
			if (policy->governor) {
				__cpufreq_governor(policy, CPUFREQ_GOV_STOP);
				/*
				 * policy->rwsem is released around POLICY_EXIT.
				 * NOTE(review): presumably the caller holds it for
				 * writing - confirm against the callers.
				 */
				up_write(&policy->rwsem);
				__cpufreq_governor(policy,
						CPUFREQ_GOV_POLICY_EXIT);
				down_write(&policy->rwsem);
			}

			/* start new governor */
			// send POLICY_INIT and START to the new governor; LIMITS follows below
			policy->governor = new_policy->governor;
			if (!__cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT)) {
				if (!__cpufreq_governor(policy, CPUFREQ_GOV_START)) {
					failed = 0;
				} else {
					/* START failed: undo the successful POLICY_INIT. */
					up_write(&policy->rwsem);
					__cpufreq_governor(policy,
							CPUFREQ_GOV_POLICY_EXIT);
					down_write(&policy->rwsem);
				}
			}

			if (failed) {
				/* new governor failed, so re-start old one */
				pr_debug("starting governor %s failed\n",
							policy->governor->name);
				if (old_gov) {
					policy->governor = old_gov;
					__cpufreq_governor(policy,
							CPUFREQ_GOV_POLICY_INIT);
					__cpufreq_governor(policy,
							   CPUFREQ_GOV_START);
				}
				ret = -EINVAL;
				goto error_out;
			}
			/* might be a policy change, too, so fall through */
		}
		pr_debug("governor: change or update limits\n");
		ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
	}

error_out:
	return ret;
}

static int __cpufreq_governor(struct cpufreq_policy *policy,
					unsigned int event)
{
	int ret;

	/* Only must be defined when default governor is known to have latency
	   restrictions, like e.g. conservative or ondemand.
	   That this is the case is already ensured in Kconfig
	*/
#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE
	struct cpufreq_governor *gov = &cpufreq_gov_performance;
#else
	struct cpufreq_governor *gov = NULL;
#endif

	if (policy->governor->max_transition_latency &&
	    policy->cpuinfo.transition_latency >
	    policy->governor->max_transition_latency) {
		if (!gov)
			return -EINVAL;
		else {
			printk(KERN_WARNING "%s governor failed, too long"
			       " transition latency of HW, fallback"
			       " to %s governor\n",
			       policy->governor->name,
			       gov->name);
			policy->governor = gov;
		}
	}

	if (event == CPUFREQ_GOV_POLICY_INIT)
		if (!try_module_get(policy->governor->owner))
			return -EINVAL;

	pr_debug("__cpufreq_governor for CPU %u, event %u\n",
						policy->cpu, event);

	mutex_lock(&cpufreq_governor_lock);
	if ((policy->governor_enabled && event == CPUFREQ_GOV_START)
	    || (!policy->governor_enabled
	    && (event == CPUFREQ_GOV_LIMITS || event == CPUFREQ_GOV_STOP))) {
		mutex_unlock(&cpufreq_governor_lock);
		return -EBUSY;
	}

	if (event == CPUFREQ_GOV_STOP)
		policy->governor_enabled = false;
	else if (event == CPUFREQ_GOV_START)
		policy->governor_enabled = true;

	mutex_unlock(&cpufreq_governor_lock);

	//执行governor主体函数

	//初始化时先采用performance模式,以最高频率运行,保证开机速度,
	//之后会通过transition notify切换到interactive
	ret = policy->governor->governor(policy, event);

	if (!ret) {
		if (event == CPUFREQ_GOV_POLICY_INIT)
			policy->governor->initialized++;
		else if (event == CPUFREQ_GOV_POLICY_EXIT)
			policy->governor->initialized--;
	} else {
		/* Restore original values */
		mutex_lock(&cpufreq_governor_lock);
		if (event == CPUFREQ_GOV_STOP)
			policy->governor_enabled = true;
		else if (event == CPUFREQ_GOV_START)
			policy->governor_enabled = false;
		mutex_unlock(&cpufreq_governor_lock);
	}

	if (((event == CPUFREQ_GOV_POLICY_INIT) && ret) ||
			((event == CPUFREQ_GOV_POLICY_EXIT) && !ret))
		module_put(policy->governor->owner);

	return ret;
}

/*
 * Per-CPU state of the interactive governor.  The governor picks the CPU
 * frequency by sampling the CPU workload from a timer; the main data
 * structures are the timers below and struct cpufreq_interactive_tunables.
 */
struct cpufreq_interactive_cpuinfo {
	/* the two timers that drive the governor */
	struct timer_list cpu_timer;
	struct timer_list cpu_slack_timer;
	spinlock_t load_lock; /* protects the next 4 fields */
	u64 time_in_idle; /* idle time returned by get_cpu_idle_time() at the last update_load() */
	u64 time_in_idle_timestamp; /* timestamp of that update_load() call */
	u64 cputime_speedadj; /* running sum of (active time * policy->cur), see update_load() */
	u64 cputime_speedadj_timestamp; /* start of the accumulation window; presumably set together with time_in_idle_timestamp when the timer is (re)started - setter not visible here */
	u64 last_evaluated_jiffy; /* get_jiffies_64() at the last timer start / load evaluation */
	struct cpufreq_policy *policy;
	struct cpufreq_frequency_table *freq_table; /* table from cpufreq_frequency_get_table(), set at GOV_START */
	spinlock_t target_freq_lock; /*protects target freq */
	unsigned int target_freq; /* frequency this CPU currently asks for */
	unsigned int floor_freq; /* frequency the timer will not go below until min_sample_time has passed */
	unsigned int max_freq; /* policy->max as last seen by the governor */
	unsigned int min_freq; /* policy->min as last seen by the governor */
	u64 floor_validate_time;
	u64 local_fvtime; /* per-cpu floor_validate_time */
	u64 hispeed_validate_time; /* cluster hispeed_validate_time */
	u64 local_hvtime; /* per-cpu hispeed_validate_time */
	u64 max_freq_hyst_start_time; /* last time the timer selected policy->max */
	struct rw_semaphore enable_sem; /* guards governor_enabled and timer setup/teardown */
	bool reject_notification;
	int governor_enabled; /* non-zero while the governor is running on this CPU */
	struct cpufreq_interactive_tunables *cached_tunables;
	int first_cpu; /* first CPU of policy->related_cpus, set at POLICY_INIT */
};

/* Per-CPU governor state. */
static DEFINE_PER_CPU(struct cpufreq_interactive_cpuinfo, cpuinfo);

/* realtime thread handles frequency scaling */
static struct task_struct *speedchange_task;
/* CPUs with a pending frequency change; filled by the timer, drained by speedchange_task */
static cpumask_t speedchange_cpumask;
/* protects speedchange_cpumask */
static spinlock_t speedchange_cpumask_lock;
/* held across the GOV_START and GOV_STOP handling */
static struct mutex gov_lock;

/* NOTE(review): the three variables below are not used in the code shown
 * here; presumably they belong to the scheduler-input enable/disable path. */
static int set_window_count;
static int migration_register_count;
static struct mutex sched_lock;

/* Target load.  Lower values result in higher CPU speeds. */
#define DEFAULT_TARGET_LOAD 90
static unsigned int default_target_loads[] = {DEFAULT_TARGET_LOAD};

#define DEFAULT_TIMER_RATE (20 * USEC_PER_MSEC)
#define DEFAULT_ABOVE_HISPEED_DELAY DEFAULT_TIMER_RATE
static unsigned int default_above_hispeed_delay[] = { /* 20 ms, expressed in usecs */
	DEFAULT_ABOVE_HISPEED_DELAY };
/*
 * Tunable parameters of the interactive governor.  One instance is either
 * attached to a single policy (policy->governor_data) or shared by all
 * policies, see cpufreq_governor_interactive().
 */
struct cpufreq_interactive_tunables {
	int usage_count; /* reference count of this tunables instance */
	/* Hi speed to bump to from lo speed when load burst (default max) */
	/*
	 * When the load reaches go_hispeed_load the frequency is raised to at
	 * least this value; if left at 0 it is set to policy->max at GOV_START.
	 * It is an intermediate step: going above it is held off for
	 * above_hispeed_delay, see __cpufreq_interactive_timer().
	 */
	unsigned int hispeed_freq;
	/* Go to hi speed when CPU load at or above this value. */
#define DEFAULT_GO_HISPEED_LOAD 99
	/* load threshold in percent (default 99); below it the frequency comes from choose_freq() alone */
	unsigned long go_hispeed_load;
	/* Target load. Lower values result in higher CPU speeds. */
	spinlock_t target_loads_lock;
	/*
	 * Array describing the load the governor aims for; the frequency is
	 * chosen so that the resulting load approaches this value.  The
	 * smaller the value, the higher the selected frequency.
	 */
	unsigned int *target_loads;
	int ntarget_loads; /* number of entries in target_loads */
	/*
	 * The minimum amount of time to spend at a frequency before we can ramp
	 * down.
	 */
#define DEFAULT_MIN_SAMPLE_TIME (80 * USEC_PER_MSEC) /* 80 ms */
	unsigned long min_sample_time; /* in usecs */
	/*
	 * The sample rate of the timer used to increase frequency
	 */
	/* sampling period used to compute the workload while the CPU is not idle */
	unsigned long timer_rate;
	/*
	 * Wait this long before raising speed above hispeed, by default a
	 * single timer interval.
	 */
	spinlock_t above_hispeed_delay_lock;
	
	/*
	 * When the current frequency is at or above hispeed_freq and the load
	 * asks for more, the governor waits above_hispeed_delay before raising
	 * the frequency further.  This is an array: the delay is looked up by
	 * frequency (freq_to_above_hispeed_delay()).
	 */
	unsigned int *above_hispeed_delay;
	/* number of entries in above_hispeed_delay */
	int nabove_hispeed_delay;
	
	/* Non-zero means indefinite speed boost active */
	/*
	 * NOTE(review): per the article (the sysfs handlers are not part of
	 * this code):
	 *   echo 1 > /sys/devices/system/cpu/cpufreq/interactive/boost
	 * immediately raises all CPUs to at least hispeed_freq; writing 0 lets
	 * the frequency drop again according to the workload.  Default is 0.
	 * boostpulse: each trigger raises all CPUs to hispeed_freq and keeps
	 * them there for at least boostpulse_duration; only after that may the
	 * frequency drop according to the workload.
	 */
	int boost_val;
	/* Duration of a boost pulse in usecs */
	int boostpulse_duration_val;
	/* End time of boost pulse in ktime converted to usecs */
	u64 boostpulse_endtime;
	bool boosted; /* recomputed by the timer: boost_val set or a boost pulse still running */
	/*
	 * Max additional time to wait in idle, beyond timer_rate, at speeds
	 * above minimum before wakeup to reduce speed, or -1 if unnecessary.
	 */
	 /*
	  * NOTE(review): per the article, the sampling timer is deferrable, so
	  * an idle CPU would not wake up to service it; timer_slack bounds that
	  * extra delay.  The default below is 4 * 20 ms = 80000 us.
	  */
#define DEFAULT_TIMER_SLACK (4 * DEFAULT_TIMER_RATE)
	int timer_slack_val;
	bool io_is_busy; /* passed to get_cpu_idle_time() in update_load() */

	/* scheduler input related flags */
	bool use_sched_load;
	bool use_migration_notif;

	/*
	 * Whether to align timer windows across all CPUs. When
	 * use_sched_load is true, this flag is ignored and windows
	 * will always be aligned.
	 */
	bool align_windows;

	/*
	 * Stay at max freq for at least max_freq_hysteresis before dropping
	 * frequency.
	 */
	unsigned int max_freq_hysteresis;
};

/*
 * Module init of the interactive governor: set up the per-CPU timers and
 * locks, create the realtime "cfinteractive" kernel thread that performs
 * the frequency changes, and register the governor with the cpufreq core.
 *
 * Returns the PTR_ERR() of the thread if it cannot be created, otherwise
 * the result of cpufreq_register_governor().
 */
static int __init cpufreq_interactive_init(void)
{
	unsigned int i;
	struct cpufreq_interactive_cpuinfo *pcpu;
	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };

	/* Initialize per-cpu timers */
	for_each_possible_cpu(i) {
		pcpu = &per_cpu(cpuinfo, i);
		/* load-sampling timer; this is the deferrable one */
		init_timer_deferrable(&pcpu->cpu_timer);
		pcpu->cpu_timer.function = cpufreq_interactive_timer;
		pcpu->cpu_timer.data = i; /* the callback receives the CPU id */
		/* slack timer: a regular (non-deferrable) timer */
		init_timer(&pcpu->cpu_slack_timer);
		pcpu->cpu_slack_timer.function = cpufreq_interactive_nop_timer;
		spin_lock_init(&pcpu->load_lock);
		spin_lock_init(&pcpu->target_freq_lock);
		init_rwsem(&pcpu->enable_sem);
	}

	spin_lock_init(&speedchange_cpumask_lock);
	mutex_init(&gov_lock);
	mutex_init(&sched_lock);
	speedchange_task =
		kthread_create(cpufreq_interactive_speedchange_task, NULL,
			       "cfinteractive");
	if (IS_ERR(speedchange_task))
		return PTR_ERR(speedchange_task);

	/* Run the speed-change thread as SCHED_FIFO at the highest RT priority. */
	sched_setscheduler_nocheck(speedchange_task, SCHED_FIFO, &param);
	get_task_struct(speedchange_task);

	/* NB: wake up so the thread does not look hung to the freezer */
	wake_up_process(speedchange_task);

	/* register the governor with the cpufreq core */
	return cpufreq_register_governor(&cpufreq_gov_interactive);
}

/*
 * Event handler of the interactive governor, called by the cpufreq core
 * (via __cpufreq_governor()) with one of the CPUFREQ_GOV_* events:
 *
 *   POLICY_INIT - attach tunables to the policy, create the sysfs group
 *                 and, for the first user, register the notifiers.
 *   POLICY_EXIT - drop the tunables reference and undo POLICY_INIT when
 *                 the last user goes away.
 *   START       - initialise the per-CPU state and start the timers.
 *   STOP        - disable the governor on each CPU and delete the timers.
 *   LIMITS      - clamp target frequencies to the new policy limits.
 *
 * Returns 0 on success, or a negative error from tunables allocation or
 * sysfs group creation during POLICY_INIT.
 */
static int cpufreq_governor_interactive(struct cpufreq_policy *policy,
		unsigned int event)

{
	int rc;
	unsigned int j;
	struct cpufreq_interactive_cpuinfo *pcpu;
	struct cpufreq_frequency_table *freq_table;
	struct cpufreq_interactive_tunables *tunables;
	unsigned long flags;
	int first_cpu;
	/*
	 * With per-policy governors every policy carries its own tunables,
	 * otherwise one instance is shared.  NOTE(review): per the article,
	 * Qualcomm enables the per-policy mode through the device tree.
	 */
	if (have_governor_per_policy())
		tunables = policy->governor_data;
	else
		tunables = common_tunables;

	/* Only POLICY_INIT may run without tunables. */
	BUG_ON(!tunables && (event != CPUFREQ_GOV_POLICY_INIT));

	switch (event) {
	case CPUFREQ_GOV_POLICY_INIT:
		if (have_governor_per_policy()) {
			WARN_ON(tunables);
		} else if (tunables) { /* shared tunables already exist: just take a reference */
			tunables->usage_count++;
			policy->governor_data = tunables;
			return 0;
		}
		/* first CPU managed by this policy */
		first_cpu = cpumask_first(policy->related_cpus);
		for_each_cpu(j, policy->related_cpus) /* record it in every related CPU's state */
			per_cpu(cpuinfo, j).first_cpu = first_cpu;
		/* reuse previously cached tunables if restore_tunables() finds any */
		tunables = restore_tunables(policy);
		if (!tunables) {
			tunables = alloc_tunable(policy); /* otherwise allocate a fresh instance */
			if (IS_ERR(tunables))
				return PTR_ERR(tunables);
		}

		tunables->usage_count = 1;
		policy->governor_data = tunables;
		if (!have_governor_per_policy()) {
			WARN_ON(cpufreq_get_global_kobject());
			common_tunables = tunables; /* publish as the shared instance */
		}
		/* create the governor's sysfs attribute group under the parent kobject */
		rc = sysfs_create_group(get_governor_parent_kobj(policy),
				get_sysfs_attr());
		if (rc) {
			kfree(tunables);
			policy->governor_data = NULL;
			if (!have_governor_per_policy()) {
				common_tunables = NULL;
				cpufreq_put_global_kobject();
			}
			return rc;
		}
		/* first initialisation of this governor: register the notifiers */
		if (!policy->governor->initialized) {
			idle_notifier_register(&cpufreq_interactive_idle_nb); /* idle notifier */
			cpufreq_register_notifier(&cpufreq_notifier_block, /* frequency transition notifier */
					CPUFREQ_TRANSITION_NOTIFIER);
		}

		if (tunables->use_sched_load)
			cpufreq_interactive_enable_sched_input(tunables);

		break;

	case CPUFREQ_GOV_POLICY_EXIT:
		if (!--tunables->usage_count) {
			if (policy->governor->initialized == 1) { /* last policy using this governor: unregister the notifiers */
				cpufreq_unregister_notifier(&cpufreq_notifier_block,
						CPUFREQ_TRANSITION_NOTIFIER);
				idle_notifier_unregister(&cpufreq_interactive_idle_nb);
			}
			/* remove the governor's sysfs group */
			sysfs_remove_group(get_governor_parent_kobj(policy),
					get_sysfs_attr());
			if (!have_governor_per_policy())
				cpufreq_put_global_kobject();
			common_tunables = NULL;
		}

		policy->governor_data = NULL;

		if (tunables->use_sched_load)
			cpufreq_interactive_disable_sched_input(tunables);

		break;

	case CPUFREQ_GOV_START: /* main job: start the workload sampling timers */
		mutex_lock(&gov_lock);
		/* fetch the CPU's frequency table */
		freq_table = cpufreq_frequency_get_table(policy->cpu);
		if (!tunables->hispeed_freq)
			tunables->hispeed_freq = policy->max;
		/* for every CPU currently in the policy */
		for_each_cpu(j, policy->cpus) {
			pcpu = &per_cpu(cpuinfo, j);
			pcpu->policy = policy;
			pcpu->target_freq = policy->cur;
			pcpu->freq_table = freq_table;
			pcpu->floor_freq = pcpu->target_freq;
			pcpu->floor_validate_time =
				ktime_to_us(ktime_get());
			pcpu->local_fvtime = pcpu->floor_validate_time;
			pcpu->hispeed_validate_time =
				pcpu->floor_validate_time;
			pcpu->local_hvtime = pcpu->floor_validate_time;
			pcpu->max_freq = policy->max;
			pcpu->min_freq = policy->min;
			pcpu->reject_notification = true;
			down_write(&pcpu->enable_sem);
			del_timer_sync(&pcpu->cpu_timer); /* workload sampling timer */
			del_timer_sync(&pcpu->cpu_slack_timer); /* slack timer */
			pcpu->last_evaluated_jiffy = get_jiffies_64();
			cpufreq_interactive_timer_start(tunables, j);
			pcpu->governor_enabled = 1;
			up_write(&pcpu->enable_sem);
			pcpu->reject_notification = false;
		}

		mutex_unlock(&gov_lock);
		break;

	case CPUFREQ_GOV_STOP: /* stop the timers */
		mutex_lock(&gov_lock);
		for_each_cpu(j, policy->cpus) {
			pcpu = &per_cpu(cpuinfo, j);
			pcpu->reject_notification = true;
			down_write(&pcpu->enable_sem);
			pcpu->governor_enabled = 0;
			pcpu->target_freq = 0;
			del_timer_sync(&pcpu->cpu_timer);
			del_timer_sync(&pcpu->cpu_slack_timer);
			up_write(&pcpu->enable_sem);
			pcpu->reject_notification = false;
		}

		mutex_unlock(&gov_lock);
		break;

	case CPUFREQ_GOV_LIMITS: /* policy limits changed: re-target and adjust the timers */
		__cpufreq_driver_target(policy,
				policy->cur, CPUFREQ_RELATION_L);
		for_each_cpu(j, policy->cpus) {
			pcpu = &per_cpu(cpuinfo, j);

			down_read(&pcpu->enable_sem);
			if (pcpu->governor_enabled == 0) {
				up_read(&pcpu->enable_sem);
				continue;
			}

			/* clamp the pending target into the new [min, max] range */
			spin_lock_irqsave(&pcpu->target_freq_lock, flags);
			if (policy->max < pcpu->target_freq)
				pcpu->target_freq = policy->max;
			else if (policy->min > pcpu->target_freq)
				pcpu->target_freq = policy->min;

			spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);

			/* the minimum was lowered: reschedule the timer */
			if (policy->min < pcpu->min_freq)
				cpufreq_interactive_timer_resched(j, true);
			pcpu->min_freq = policy->min;

			up_read(&pcpu->enable_sem);

			/* Reschedule timer only if policy->max is raised.
			 * Delete the timers, else the timer callback may
			 * return without re-arm the timer when failed
			 * acquire the semaphore. This race may cause timer
			 * stopped unexpectedly.
			 */

			if (policy->max > pcpu->max_freq) {
				pcpu->reject_notification = true;
				down_write(&pcpu->enable_sem);
				del_timer_sync(&pcpu->cpu_timer);
				del_timer_sync(&pcpu->cpu_slack_timer);
				cpufreq_interactive_timer_resched(j, false);
				up_write(&pcpu->enable_sem);
				pcpu->reject_notification = false;
			}

			pcpu->max_freq = policy->max;
		}
		break;
	}
	return 0;
}

/*
 * Timer-side workhorse of the governor: compute the CPU's current workload,
 * derive a target frequency from it and, if the target changed, hand the
 * CPU over to the speed-change thread.  The workload is essentially the
 * share of the sampling interval during which the CPU was active.
 *
 * data:     id of the CPU being evaluated.
 * is_notif: when true the max_freq_hysteresis and floor_freq hold-offs
 *           below are skipped (callers are not part of this code; per the
 *           article the timer path passes false).
 */
static void __cpufreq_interactive_timer(unsigned long data, bool is_notif)
{
	u64 now;
	unsigned int delta_time;
	u64 cputime_speedadj;
	int cpu_load;
	struct cpufreq_interactive_cpuinfo *pcpu =
		&per_cpu(cpuinfo, data);
	/* tunables attached to this CPU's policy */
	struct cpufreq_interactive_tunables *tunables =
		pcpu->policy->governor_data;
	unsigned int new_freq;
	unsigned int loadadjfreq;
	unsigned int index;
	unsigned long flags;
	struct cpufreq_govinfo int_info;
	u64 max_fvtime;

	/* Bail out without re-arming if the governor is being reconfigured. */
	if (!down_read_trylock(&pcpu->enable_sem))
		return;
	if (!pcpu->governor_enabled)
		goto exit;

	spin_lock_irqsave(&pcpu->load_lock, flags);
	pcpu->last_evaluated_jiffy = get_jiffies_64(); /* remember when the load was last evaluated */
	now = update_load(data); /* account the activity so far; returns the current timestamp */
	if (tunables->use_sched_load) { /* take the load from the scheduler instead of idle-time accounting */
		/*
		 * Unlock early to avoid deadlock.
		 *
		 * load_change_callback() for thread migration already
		 * holds rq lock. Then it locks load_lock to avoid racing
		 * with cpufreq_interactive_timer_resched/start().
		 * sched_get_busy() will also acquire rq lock. Thus we
		 * can't hold load_lock when calling sched_get_busy().
		 *
		 * load_lock used in this function protects time
		 * and load information. These stats are not used when
		 * scheduler input is available. Thus unlocking load_lock
		 * early is perfectly OK.
		 */
		spin_unlock_irqrestore(&pcpu->load_lock, flags);
		/* busy time from the scheduler, scaled by the hardware maximum frequency */
		cputime_speedadj = (u64)sched_get_busy(data) *
				pcpu->policy->cpuinfo.max_freq;
		do_div(cputime_speedadj, tunables->timer_rate); /* normalise to one timer period */
		/* i.e. cputime_speedadj = cputime_speedadj / tunables->timer_rate */
	} else {
		delta_time = (unsigned int) /* time elapsed since the accumulation window started */
				(now - pcpu->cputime_speedadj_timestamp);
		cputime_speedadj = pcpu->cputime_speedadj; /* accumulated (active time * frequency) */
		spin_unlock_irqrestore(&pcpu->load_lock, flags);
		if (WARN_ON_ONCE(!delta_time))
			goto rearm;
		do_div(cputime_speedadj, delta_time);
	}
	/* Scale by 100 so the integer divisions below yield percentages. */
	loadadjfreq = (unsigned int)cputime_speedadj * 100;

	int_info.cpu = data;
	int_info.load = loadadjfreq / pcpu->policy->max;
	int_info.sampling_rate_us = tunables->timer_rate;
	/*
	 * Publish the load to listeners.  NOTE(review): per the article a
	 * Qualcomm driver (perf_govinfo_notify) consumes this notification.
	 */
	atomic_notifier_call_chain(&cpufreq_govinfo_notifier_list,
					CPUFREQ_LOAD_CHANGE, &int_info);

	spin_lock_irqsave(&pcpu->target_freq_lock, flags);
	/* load in percent relative to the current frequency */
	cpu_load = loadadjfreq / pcpu->policy->cur;
	
	/* boosted if a permanent boost is set or a boost pulse is still running */
	tunables->boosted = tunables->boost_val || now < tunables->boostpulse_endtime;

	/*
	 * When cpu_load reaches go_hispeed_load, or a boost is active, the
	 * new frequency must be at least tunables->hispeed_freq.
	 */
	if (cpu_load >= tunables->go_hispeed_load || tunables->boosted) {
		if (pcpu->policy->cur < tunables->hispeed_freq &&
		    cpu_load <= MAX_LOCAL_LOAD) { /* still below hispeed_freq: jump straight to it */
			new_freq = tunables->hispeed_freq;
		} else { /* already at/above hispeed_freq: pick by load, but never below hispeed_freq */
			new_freq = choose_freq(pcpu, loadadjfreq);

			if (new_freq < tunables->hispeed_freq)
				new_freq = tunables->hispeed_freq;
		}
	} else { /* below the threshold: let choose_freq() decide, which may lower the frequency */
		new_freq = choose_freq(pcpu, loadadjfreq);
	}
	/*
	 * Going above hispeed_freq is held off until above_hispeed_delay has
	 * passed since hispeed_validate_time; skip the change until then.
	 */
	if (cpu_load <= MAX_LOCAL_LOAD &&
	    pcpu->policy->cur >= tunables->hispeed_freq &&
	    new_freq > pcpu->policy->cur &&
	    now - pcpu->hispeed_validate_time <
	    freq_to_above_hispeed_delay(tunables, pcpu->policy->cur)) { /* the delay depends on the current frequency */
		trace_cpufreq_interactive_notyet(
			data, cpu_load, pcpu->target_freq,
			pcpu->policy->cur, new_freq);
		spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
		goto rearm;
	}

	pcpu->local_hvtime = now;

	/* map new_freq onto the table: lowest entry at or above it (RELATION_L) */
	if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table,
					   new_freq, CPUFREQ_RELATION_L,
					   &index)) {
		spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
		goto rearm;
	}

	new_freq = pcpu->freq_table[index].frequency;
	/* do not drop below the target while the max-frequency hysteresis is running */
	if (!is_notif && new_freq < pcpu->target_freq &&
	    now - pcpu->max_freq_hyst_start_time <
	    tunables->max_freq_hysteresis) {
		trace_cpufreq_interactive_notyet(data, cpu_load,
			pcpu->target_freq, pcpu->policy->cur, new_freq);
		spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
		goto rearm;
	}

	/*
	 * Do not scale below floor_freq unless we have been at or above the
	 * floor frequency for the minimum sample time since last validated.
	 */
	/* the reference point is the later of the cluster-wide and the per-CPU floor validation time */
	max_fvtime = max(pcpu->floor_validate_time, pcpu->local_fvtime);
	if (!is_notif && new_freq < pcpu->floor_freq &&
	    pcpu->target_freq >= pcpu->policy->cur) {
		if (now - max_fvtime < tunables->min_sample_time) {
			trace_cpufreq_interactive_notyet(
				data, cpu_load, pcpu->target_freq,
				pcpu->policy->cur, new_freq);
			spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
			goto rearm;
		}
	}

	/*
	 * Update the timestamp for checking whether speed has been held at
	 * or above the selected frequency for a minimum of min_sample_time,
	 * if not boosted to hispeed_freq.  If boosted to hispeed_freq then we
	 * allow the speed to drop as soon as the boostpulse duration expires
	 * (or the indefinite boost is turned off).
	 */
	/* not boosted, or the new frequency is above hispeed_freq */
	if (!tunables->boosted || new_freq > tunables->hispeed_freq) {
		pcpu->floor_freq = new_freq;
		if (pcpu->target_freq >= pcpu->policy->cur ||
		    new_freq >= pcpu->policy->cur)
			pcpu->local_fvtime = now;
	}

	/* (re)start the max-frequency hysteresis window */
	if (new_freq == pcpu->policy->max)
		pcpu->max_freq_hyst_start_time = now;

	if (pcpu->target_freq == new_freq) {
		trace_cpufreq_interactive_already(
			data, cpu_load, pcpu->target_freq,
			pcpu->policy->cur, new_freq);
		spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
		goto rearm;
	}

	trace_cpufreq_interactive_target(data, cpu_load, pcpu->target_freq,
					 pcpu->policy->cur, new_freq);

	pcpu->target_freq = new_freq; /* publish the new target; until here it held the previous one */
	spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
	spin_lock_irqsave(&speedchange_cpumask_lock, flags);
	cpumask_set_cpu(data, &speedchange_cpumask); /* mark this CPU as needing a frequency change */
	spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
	wake_up_process(speedchange_task); /* the thread performs the actual frequency change */

rearm:
	if (!timer_pending(&pcpu->cpu_timer)) /* re-arm the sampling timer unless it is already pending */
		cpufreq_interactive_timer_resched(data, false);

exit:
	up_read(&pcpu->enable_sem);
	return;
}

/*
 * Fold the CPU activity since the previous call into @cpu's running
 * cputime_speedadj accumulator (active time multiplied by the current
 * policy frequency) and refresh the idle bookkeeping.
 *
 * Returns the timestamp reported by get_cpu_idle_time().
 */
static u64 update_load(int cpu)
{
	struct cpufreq_interactive_cpuinfo *ci = &per_cpu(cpuinfo, cpu);
	struct cpufreq_interactive_tunables *tun = ci->policy->governor_data;
	unsigned int idle_elapsed;
	unsigned int wall_elapsed;
	u64 idle_total;
	u64 stamp;
	u64 busy;

	/* idle_total: idle time so far; stamp: the matching "now" timestamp */
	idle_total = get_cpu_idle_time(cpu, &stamp, tun->io_is_busy);

	/* Deltas against the values stored by the previous call. */
	idle_elapsed = (unsigned int)(idle_total - ci->time_in_idle);
	wall_elapsed = (unsigned int)(stamp - ci->time_in_idle_timestamp);

	/* Active time is whatever was not idle, never negative. */
	busy = (wall_elapsed <= idle_elapsed) ? 0 : wall_elapsed - idle_elapsed;

	ci->cputime_speedadj += busy * ci->policy->cur;

	ci->time_in_idle_timestamp = stamp;
	ci->time_in_idle = idle_total;

	return stamp;
}
 /*
 choose_freq函数用来选频,使选频后的系统workload小于或等于target load
 核心思想是:选择最小的频率来满足target load
 loadadjfreq一段时间内工作量
 */
/*
 * Search the frequency table for the lowest frequency at which the given
 * load does not exceed the target load configured for that frequency.
 *
 * pcpu:        per-CPU governor state (policy, frequency table).
 * loadadjfreq: measured load expressed as frequency * percent.
 *
 * The search starts at policy->cur and iterates: each candidate's target
 * load yields a new wanted frequency, while two bounds remember the highest
 * frequency known to be too slow and the lowest one known to be sufficient.
 * Returns the selected frequency; if a table lookup fails the candidate
 * reached so far is returned.
 */
static unsigned int choose_freq(struct cpufreq_interactive_cpuinfo *pcpu,
		unsigned int loadadjfreq)
{
	unsigned int cand = pcpu->policy->cur;	/* candidate, starts at current */
	unsigned int last;			/* candidate of the previous round */
	unsigned int too_slow = 0;		/* highest frequency found too low */
	unsigned int sufficient = UINT_MAX;	/* lowest frequency found enough */
	unsigned int target_load;
	int idx;

	for (;;) {
		last = cand;

		/* target load configured for the candidate frequency */
		target_load = freq_to_targetload(pcpu->policy->governor_data,
						 cand);

		/*
		 * Find the lowest frequency where the computed load is less
		 * than or equal to the target load.
		 */
		if (cpufreq_frequency_table_target(
			    pcpu->policy, pcpu->freq_table,
			    loadadjfreq / target_load,
			    CPUFREQ_RELATION_L, &idx))
			break;
		cand = pcpu->freq_table[idx].frequency;

		if (cand > last) {
			/* The previous frequency is too low. */
			too_slow = last;

			if (cand >= sufficient) {
				/*
				 * Find the highest frequency that is less
				 * than the known-sufficient one.
				 */
				if (cpufreq_frequency_table_target(
					    pcpu->policy, pcpu->freq_table,
					    sufficient - 1, CPUFREQ_RELATION_H,
					    &idx))
					break;
				cand = pcpu->freq_table[idx].frequency;

				if (cand == too_slow) {
					/*
					 * The first frequency below the
					 * sufficient one was already found to
					 * be too low, so the sufficient one is
					 * the lowest speed that is fast enough.
					 */
					cand = sufficient;
					break;
				}
			}
		} else if (cand < last) {
			/* The previous frequency is high enough. */
			sufficient = last;

			if (cand <= too_slow) {
				/*
				 * Find the lowest frequency that is higher
				 * than the known-too-slow one.
				 */
				if (cpufreq_frequency_table_target(
					    pcpu->policy, pcpu->freq_table,
					    too_slow + 1, CPUFREQ_RELATION_L,
					    &idx))
					break;
				cand = pcpu->freq_table[idx].frequency;

				/*
				 * If the sufficient frequency is the first one
				 * above the too-slow one, it is already known
				 * to be fast enough.
				 */
				if (cand == sufficient)
					break;
			}
		}

		/* If same frequency chosen as previous then done. */
		if (cand == last)
			break;
	}

	return cand;
}

/*
 * Body of the "cfinteractive" kernel thread that performs the actual
 * frequency changes.
 *
 * The thread sleeps until speedchange_cpumask is non-empty, takes a
 * private copy of the mask and clears the shared one.  For every CPU in
 * the copy it sets that CPU's policy to the highest target_freq requested
 * by any CPU sharing the policy, and propagates the floor/hispeed
 * validation timestamps to all CPUs of the policy.
 *
 * data: unused.  Returns 0 once kthread_should_stop() is observed.
 */
static int cpufreq_interactive_speedchange_task(void *data)
{
	unsigned int cpu;
	cpumask_t pending;
	unsigned long irqflags;
	struct cpufreq_interactive_cpuinfo *pcpu;

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock_irqsave(&speedchange_cpumask_lock, irqflags);

		if (cpumask_empty(&speedchange_cpumask)) {
			/* Nothing to do: give up the CPU until woken. */
			spin_unlock_irqrestore(&speedchange_cpumask_lock,
					       irqflags);
			schedule();

			if (kthread_should_stop())
				break;

			spin_lock_irqsave(&speedchange_cpumask_lock, irqflags);
		}

		set_current_state(TASK_RUNNING);

		/* Take over the pending set and reset the shared mask. */
		pending = speedchange_cpumask;
		cpumask_clear(&speedchange_cpumask);
		spin_unlock_irqrestore(&speedchange_cpumask_lock, irqflags);

		for_each_cpu(cpu, &pending) {
			struct cpufreq_interactive_cpuinfo *sibling;
			unsigned int sib;
			unsigned int wanted = 0;
			u64 hispeed_ts = ~0ULL, floor_ts = 0;

			pcpu = &per_cpu(cpuinfo, cpu);
			if (!down_read_trylock(&pcpu->enable_sem))
				continue;
			if (!pcpu->governor_enabled) {
				up_read(&pcpu->enable_sem);
				continue;
			}

			/*
			 * Highest target among the CPUs sharing the policy,
			 * the earliest local_hvtime among those asking for
			 * it, and the latest local_fvtime overall.
			 */
			for_each_cpu(sib, pcpu->policy->cpus) {
				sibling = &per_cpu(cpuinfo, sib);

				floor_ts = max(floor_ts, sibling->local_fvtime);
				if (sibling->target_freq > wanted) {
					wanted = sibling->target_freq;
					hispeed_ts = sibling->local_hvtime;
				} else if (sibling->target_freq == wanted) {
					hispeed_ts = min(hispeed_ts,
							 sibling->local_hvtime);
				}
			}

			/* Needs the final maximum, hence the second pass. */
			for_each_cpu(sib, pcpu->policy->cpus) {
				sibling = &per_cpu(cpuinfo, sib);
				sibling->floor_validate_time = floor_ts;
			}

			/* One request covers all CPUs that share the policy. */
			if (wanted != pcpu->policy->cur) {
				__cpufreq_driver_target(pcpu->policy,
							wanted,
							CPUFREQ_RELATION_H);
				for_each_cpu(sib, pcpu->policy->cpus) {
					sibling = &per_cpu(cpuinfo, sib);
					sibling->hispeed_validate_time =
						hispeed_ts;
				}
			}

			trace_cpufreq_interactive_setspeed(cpu,
						     pcpu->target_freq,
						     pcpu->policy->cur);

			up_read(&pcpu->enable_sem);
		}
	}

	return 0;
}


你可能感兴趣的:(linux cpufreq interactive调频代码实现下)