forked from torvalds/linux
-
Notifications
You must be signed in to change notification settings - Fork 0
/
intel_pstate.c
3570 lines (2869 loc) · 90.5 KB
/
intel_pstate.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// SPDX-License-Identifier: GPL-2.0-only
/*
* intel_pstate.c: Native P state management for Intel processors
*
* (C) Copyright 2012 Intel Corporation
* Author: Dirk Brandewie <[email protected]>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/module.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/slab.h>
#include <linux/sched/cpufreq.h>
#include <linux/list.h>
#include <linux/cpu.h>
#include <linux/cpufreq.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/acpi.h>
#include <linux/vmalloc.h>
#include <linux/pm_qos.h>
#include <trace/events/power.h>
#include <asm/div64.h>
#include <asm/msr.h>
#include <asm/cpu_device_id.h>
#include <asm/cpufeature.h>
#include <asm/intel-family.h>
#include "../drivers/thermal/intel/thermal_interrupt.h"
#define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC)
#define INTEL_CPUFREQ_TRANSITION_LATENCY 20000
#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP 5000
#define INTEL_CPUFREQ_TRANSITION_DELAY 500
#ifdef CONFIG_ACPI
#include <acpi/processor.h>
#include <acpi/cppc_acpi.h>
#endif
#define FRAC_BITS 8
#define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
#define fp_toint(X) ((X) >> FRAC_BITS)
#define ONE_EIGHTH_FP ((int64_t)1 << (FRAC_BITS - 3))
#define EXT_BITS 6
#define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS)
#define fp_ext_toint(X) ((X) >> EXT_FRAC_BITS)
#define int_ext_tofp(X) ((int64_t)(X) << EXT_FRAC_BITS)
static inline int32_t mul_fp(int32_t x, int32_t y)
{
return ((int64_t)x * (int64_t)y) >> FRAC_BITS;
}
static inline int32_t div_fp(s64 x, s64 y)
{
return div64_s64((int64_t)x << FRAC_BITS, y);
}
static inline int ceiling_fp(int32_t x)
{
int mask, ret;
ret = fp_toint(x);
mask = (1 << FRAC_BITS) - 1;
if (x & mask)
ret += 1;
return ret;
}
static inline u64 mul_ext_fp(u64 x, u64 y)
{
return (x * y) >> EXT_FRAC_BITS;
}
static inline u64 div_ext_fp(u64 x, u64 y)
{
return div64_u64(x << EXT_FRAC_BITS, y);
}
/**
* struct sample - Store performance sample
* @core_avg_perf: Ratio of APERF/MPERF which is the actual average
* performance during last sample period
* @busy_scaled: Scaled busy value which is used to calculate next
* P state. This can be different than core_avg_perf
* to account for cpu idle period
* @aperf: Difference of actual performance frequency clock count
* read from APERF MSR between last and current sample
* @mperf: Difference of maximum performance frequency clock count
* read from MPERF MSR between last and current sample
* @tsc: Difference of time stamp counter between last and
* current sample
* @time: Current time from scheduler
*
* This structure is used in the cpudata structure to store performance sample
* data for choosing next P State.
*/
struct sample {
int32_t core_avg_perf;
int32_t busy_scaled;
u64 aperf;
u64 mperf;
u64 tsc;
u64 time;
};
/**
* struct pstate_data - Store P state data
* @current_pstate: Current requested P state
* @min_pstate: Min P state possible for this platform
* @max_pstate: Max P state possible for this platform
* @max_pstate_physical:This is physical Max P state for a processor
* This can be higher than the max_pstate which can
* be limited by platform thermal design power limits
* @perf_ctl_scaling: PERF_CTL P-state to frequency scaling factor
* @scaling: Scaling factor between performance and frequency
* @turbo_pstate: Max Turbo P state possible for this platform
* @min_freq: @min_pstate frequency in cpufreq units
* @max_freq: @max_pstate frequency in cpufreq units
* @turbo_freq: @turbo_pstate frequency in cpufreq units
*
* Stores the per cpu model P state limits and current P state.
*/
struct pstate_data {
int current_pstate;
int min_pstate;
int max_pstate;
int max_pstate_physical;
int perf_ctl_scaling;
int scaling;
int turbo_pstate;
unsigned int min_freq;
unsigned int max_freq;
unsigned int turbo_freq;
};
/**
* struct vid_data - Stores voltage information data
* @min: VID data for this platform corresponding to
* the lowest P state
* @max: VID data corresponding to the highest P State.
* @turbo: VID data for turbo P state
* @ratio: Ratio of (vid max - vid min) /
* (max P state - Min P State)
*
* Stores the voltage data for DVFS (Dynamic Voltage and Frequency Scaling)
* This data is used in Atom platforms, where in addition to target P state,
* the voltage data needs to be specified to select next P State.
*/
struct vid_data {
int min;
int max;
int turbo;
int32_t ratio;
};
/**
* struct global_params - Global parameters, mostly tunable via sysfs.
* @no_turbo: Whether or not to use turbo P-states.
* @turbo_disabled: Whether or not turbo P-states are available at all,
* based on the MSR_IA32_MISC_ENABLE value and whether or
* not the maximum reported turbo P-state is different from
* the maximum reported non-turbo one.
* @turbo_disabled_mf: The @turbo_disabled value reflected by cpuinfo.max_freq.
* @min_perf_pct: Minimum capacity limit in percent of the maximum turbo
* P-state capacity.
* @max_perf_pct: Maximum capacity limit in percent of the maximum turbo
* P-state capacity.
*/
struct global_params {
bool no_turbo;
bool turbo_disabled;
bool turbo_disabled_mf;
int max_perf_pct;
int min_perf_pct;
};
/**
* struct cpudata - Per CPU instance data storage
* @cpu: CPU number for this instance data
* @policy: CPUFreq policy value
* @update_util: CPUFreq utility callback information
* @update_util_set: CPUFreq utility callback is set
* @iowait_boost: iowait-related boost fraction
* @last_update: Time of the last update.
* @pstate: Stores P state limits for this CPU
* @vid: Stores VID limits for this CPU
* @last_sample_time: Last Sample time
* @aperf_mperf_shift: APERF vs MPERF counting frequency difference
* @prev_aperf: Last APERF value read from APERF MSR
* @prev_mperf: Last MPERF value read from MPERF MSR
* @prev_tsc: Last timestamp counter (TSC) value
* @prev_cummulative_iowait: IO Wait time difference from last and
* current sample
* @sample: Storage for storing last Sample data
* @min_perf_ratio: Minimum capacity in terms of PERF or HWP ratios
* @max_perf_ratio: Maximum capacity in terms of PERF or HWP ratios
* @acpi_perf_data: Stores ACPI perf information read from _PSS
* @valid_pss_table: Set to true for valid ACPI _PSS entries found
* @epp_powersave: Last saved HWP energy performance preference
* (EPP) or energy performance bias (EPB),
* when policy switched to performance
* @epp_policy: Last saved policy used to set EPP/EPB
* @epp_default: Power on default HWP energy performance
* preference/bias
* @epp_cached Cached HWP energy-performance preference value
* @hwp_req_cached: Cached value of the last HWP Request MSR
* @hwp_cap_cached: Cached value of the last HWP Capabilities MSR
* @last_io_update: Last time when IO wake flag was set
* @sched_flags: Store scheduler flags for possible cross CPU update
* @hwp_boost_min: Last HWP boosted min performance
* @suspended: Whether or not the driver has been suspended.
* @hwp_notify_work: workqueue for HWP notifications.
*
* This structure stores per CPU instance data for all CPUs.
*/
struct cpudata {
int cpu;
unsigned int policy;
struct update_util_data update_util;
bool update_util_set;
struct pstate_data pstate;
struct vid_data vid;
u64 last_update;
u64 last_sample_time;
u64 aperf_mperf_shift;
u64 prev_aperf;
u64 prev_mperf;
u64 prev_tsc;
u64 prev_cummulative_iowait;
struct sample sample;
int32_t min_perf_ratio;
int32_t max_perf_ratio;
#ifdef CONFIG_ACPI
struct acpi_processor_performance acpi_perf_data;
bool valid_pss_table;
#endif
unsigned int iowait_boost;
s16 epp_powersave;
s16 epp_policy;
s16 epp_default;
s16 epp_cached;
u64 hwp_req_cached;
u64 hwp_cap_cached;
u64 last_io_update;
unsigned int sched_flags;
u32 hwp_boost_min;
bool suspended;
struct delayed_work hwp_notify_work;
};
static struct cpudata **all_cpu_data;
/**
* struct pstate_funcs - Per CPU model specific callbacks
* @get_max: Callback to get maximum non turbo effective P state
* @get_max_physical: Callback to get maximum non turbo physical P state
* @get_min: Callback to get minimum P state
* @get_turbo: Callback to get turbo P state
* @get_scaling: Callback to get frequency scaling factor
* @get_cpu_scaling: Get frequency scaling factor for a given cpu
* @get_aperf_mperf_shift: Callback to get the APERF vs MPERF frequency difference
* @get_val: Callback to convert P state to actual MSR write value
* @get_vid: Callback to get VID data for Atom platforms
*
* Core and Atom CPU models have different way to get P State limits. This
* structure is used to store those callbacks.
*/
struct pstate_funcs {
int (*get_max)(void);
int (*get_max_physical)(void);
int (*get_min)(void);
int (*get_turbo)(void);
int (*get_scaling)(void);
int (*get_cpu_scaling)(int cpu);
int (*get_aperf_mperf_shift)(void);
u64 (*get_val)(struct cpudata*, int pstate);
void (*get_vid)(struct cpudata *);
};
static struct pstate_funcs pstate_funcs __read_mostly;
static int hwp_active __read_mostly;
static int hwp_mode_bdw __read_mostly;
static bool per_cpu_limits __read_mostly;
static bool hwp_boost __read_mostly;
static struct cpufreq_driver *intel_pstate_driver __read_mostly;
#ifdef CONFIG_ACPI
static bool acpi_ppc;
#endif
static struct global_params global;
static DEFINE_MUTEX(intel_pstate_driver_lock);
static DEFINE_MUTEX(intel_pstate_limits_lock);
#ifdef CONFIG_ACPI
static bool intel_pstate_acpi_pm_profile_server(void)
{
if (acpi_gbl_FADT.preferred_profile == PM_ENTERPRISE_SERVER ||
acpi_gbl_FADT.preferred_profile == PM_PERFORMANCE_SERVER)
return true;
return false;
}
static bool intel_pstate_get_ppc_enable_status(void)
{
if (intel_pstate_acpi_pm_profile_server())
return true;
return acpi_ppc;
}
#ifdef CONFIG_ACPI_CPPC_LIB
/* The work item is needed to avoid CPU hotplug locking issues */
static void intel_pstste_sched_itmt_work_fn(struct work_struct *work)
{
sched_set_itmt_support();
}
static DECLARE_WORK(sched_itmt_work, intel_pstste_sched_itmt_work_fn);
#define CPPC_MAX_PERF U8_MAX
static void intel_pstate_set_itmt_prio(int cpu)
{
struct cppc_perf_caps cppc_perf;
static u32 max_highest_perf = 0, min_highest_perf = U32_MAX;
int ret;
ret = cppc_get_perf_caps(cpu, &cppc_perf);
if (ret)
return;
/*
* On some systems with overclocking enabled, CPPC.highest_perf is hardcoded to 0xff.
* In this case we can't use CPPC.highest_perf to enable ITMT.
* In this case we can look at MSR_HWP_CAPABILITIES bits [8:0] to decide.
*/
if (cppc_perf.highest_perf == CPPC_MAX_PERF)
cppc_perf.highest_perf = HWP_HIGHEST_PERF(READ_ONCE(all_cpu_data[cpu]->hwp_cap_cached));
/*
* The priorities can be set regardless of whether or not
* sched_set_itmt_support(true) has been called and it is valid to
* update them at any time after it has been called.
*/
sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu);
if (max_highest_perf <= min_highest_perf) {
if (cppc_perf.highest_perf > max_highest_perf)
max_highest_perf = cppc_perf.highest_perf;
if (cppc_perf.highest_perf < min_highest_perf)
min_highest_perf = cppc_perf.highest_perf;
if (max_highest_perf > min_highest_perf) {
/*
* This code can be run during CPU online under the
* CPU hotplug locks, so sched_set_itmt_support()
* cannot be called from here. Queue up a work item
* to invoke it.
*/
schedule_work(&sched_itmt_work);
}
}
}
static int intel_pstate_get_cppc_guaranteed(int cpu)
{
struct cppc_perf_caps cppc_perf;
int ret;
ret = cppc_get_perf_caps(cpu, &cppc_perf);
if (ret)
return ret;
if (cppc_perf.guaranteed_perf)
return cppc_perf.guaranteed_perf;
return cppc_perf.nominal_perf;
}
static u32 intel_pstate_cppc_nominal(int cpu)
{
u64 nominal_perf;
if (cppc_get_nominal_perf(cpu, &nominal_perf))
return 0;
return nominal_perf;
}
#else /* CONFIG_ACPI_CPPC_LIB */
static inline void intel_pstate_set_itmt_prio(int cpu)
{
}
#endif /* CONFIG_ACPI_CPPC_LIB */
static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
{
struct cpudata *cpu;
int ret;
int i;
if (hwp_active) {
intel_pstate_set_itmt_prio(policy->cpu);
return;
}
if (!intel_pstate_get_ppc_enable_status())
return;
cpu = all_cpu_data[policy->cpu];
ret = acpi_processor_register_performance(&cpu->acpi_perf_data,
policy->cpu);
if (ret)
return;
/*
* Check if the control value in _PSS is for PERF_CTL MSR, which should
* guarantee that the states returned by it map to the states in our
* list directly.
*/
if (cpu->acpi_perf_data.control_register.space_id !=
ACPI_ADR_SPACE_FIXED_HARDWARE)
goto err;
/*
* If there is only one entry _PSS, simply ignore _PSS and continue as
* usual without taking _PSS into account
*/
if (cpu->acpi_perf_data.state_count < 2)
goto err;
pr_debug("CPU%u - ACPI _PSS perf data\n", policy->cpu);
for (i = 0; i < cpu->acpi_perf_data.state_count; i++) {
pr_debug(" %cP%d: %u MHz, %u mW, 0x%x\n",
(i == cpu->acpi_perf_data.state ? '*' : ' '), i,
(u32) cpu->acpi_perf_data.states[i].core_frequency,
(u32) cpu->acpi_perf_data.states[i].power,
(u32) cpu->acpi_perf_data.states[i].control);
}
/*
* The _PSS table doesn't contain whole turbo frequency range.
* This just contains +1 MHZ above the max non turbo frequency,
* with control value corresponding to max turbo ratio. But
* when cpufreq set policy is called, it will call with this
* max frequency, which will cause a reduced performance as
* this driver uses real max turbo frequency as the max
* frequency. So correct this frequency in _PSS table to
* correct max turbo frequency based on the turbo state.
* Also need to convert to MHz as _PSS freq is in MHz.
*/
if (!global.turbo_disabled)
cpu->acpi_perf_data.states[0].core_frequency =
policy->cpuinfo.max_freq / 1000;
cpu->valid_pss_table = true;
pr_debug("_PPC limits will be enforced\n");
return;
err:
cpu->valid_pss_table = false;
acpi_processor_unregister_performance(policy->cpu);
}
static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
{
struct cpudata *cpu;
cpu = all_cpu_data[policy->cpu];
if (!cpu->valid_pss_table)
return;
acpi_processor_unregister_performance(policy->cpu);
}
#else /* CONFIG_ACPI */
static inline void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
{
}
static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
{
}
static inline bool intel_pstate_acpi_pm_profile_server(void)
{
return false;
}
#endif /* CONFIG_ACPI */
#ifndef CONFIG_ACPI_CPPC_LIB
static inline int intel_pstate_get_cppc_guaranteed(int cpu)
{
return -ENOTSUPP;
}
#endif /* CONFIG_ACPI_CPPC_LIB */
/**
* intel_pstate_hybrid_hwp_adjust - Calibrate HWP performance levels.
* @cpu: Target CPU.
*
* On hybrid processors, HWP may expose more performance levels than there are
* P-states accessible through the PERF_CTL interface. If that happens, the
* scaling factor between HWP performance levels and CPU frequency will be less
* than the scaling factor between P-state values and CPU frequency.
*
* In that case, adjust the CPU parameters used in computations accordingly.
*/
static void intel_pstate_hybrid_hwp_adjust(struct cpudata *cpu)
{
int perf_ctl_max_phys = cpu->pstate.max_pstate_physical;
int perf_ctl_scaling = cpu->pstate.perf_ctl_scaling;
int perf_ctl_turbo = pstate_funcs.get_turbo();
int turbo_freq = perf_ctl_turbo * perf_ctl_scaling;
int scaling = cpu->pstate.scaling;
pr_debug("CPU%d: perf_ctl_max_phys = %d\n", cpu->cpu, perf_ctl_max_phys);
pr_debug("CPU%d: perf_ctl_max = %d\n", cpu->cpu, pstate_funcs.get_max());
pr_debug("CPU%d: perf_ctl_turbo = %d\n", cpu->cpu, perf_ctl_turbo);
pr_debug("CPU%d: perf_ctl_scaling = %d\n", cpu->cpu, perf_ctl_scaling);
pr_debug("CPU%d: HWP_CAP guaranteed = %d\n", cpu->cpu, cpu->pstate.max_pstate);
pr_debug("CPU%d: HWP_CAP highest = %d\n", cpu->cpu, cpu->pstate.turbo_pstate);
pr_debug("CPU%d: HWP-to-frequency scaling factor: %d\n", cpu->cpu, scaling);
/*
* If the product of the HWP performance scaling factor and the HWP_CAP
* highest performance is greater than the maximum turbo frequency
* corresponding to the pstate_funcs.get_turbo() return value, the
* scaling factor is too high, so recompute it to make the HWP_CAP
* highest performance correspond to the maximum turbo frequency.
*/
cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * scaling;
if (turbo_freq < cpu->pstate.turbo_freq) {
cpu->pstate.turbo_freq = turbo_freq;
scaling = DIV_ROUND_UP(turbo_freq, cpu->pstate.turbo_pstate);
cpu->pstate.scaling = scaling;
pr_debug("CPU%d: refined HWP-to-frequency scaling factor: %d\n",
cpu->cpu, scaling);
}
cpu->pstate.max_freq = rounddown(cpu->pstate.max_pstate * scaling,
perf_ctl_scaling);
cpu->pstate.max_pstate_physical =
DIV_ROUND_UP(perf_ctl_max_phys * perf_ctl_scaling,
scaling);
cpu->pstate.min_freq = cpu->pstate.min_pstate * perf_ctl_scaling;
/*
* Cast the min P-state value retrieved via pstate_funcs.get_min() to
* the effective range of HWP performance levels.
*/
cpu->pstate.min_pstate = DIV_ROUND_UP(cpu->pstate.min_freq, scaling);
}
static inline void update_turbo_state(void)
{
u64 misc_en;
struct cpudata *cpu;
cpu = all_cpu_data[0];
rdmsrl(MSR_IA32_MISC_ENABLE, misc_en);
global.turbo_disabled =
(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ||
cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
}
static int min_perf_pct_min(void)
{
struct cpudata *cpu = all_cpu_data[0];
int turbo_pstate = cpu->pstate.turbo_pstate;
return turbo_pstate ?
(cpu->pstate.min_pstate * 100 / turbo_pstate) : 0;
}
static s16 intel_pstate_get_epb(struct cpudata *cpu_data)
{
u64 epb;
int ret;
if (!boot_cpu_has(X86_FEATURE_EPB))
return -ENXIO;
ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
if (ret)
return (s16)ret;
return (s16)(epb & 0x0f);
}
static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data)
{
s16 epp;
if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
/*
* When hwp_req_data is 0, means that caller didn't read
* MSR_HWP_REQUEST, so need to read and get EPP.
*/
if (!hwp_req_data) {
epp = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST,
&hwp_req_data);
if (epp)
return epp;
}
epp = (hwp_req_data >> 24) & 0xff;
} else {
/* When there is no EPP present, HWP uses EPB settings */
epp = intel_pstate_get_epb(cpu_data);
}
return epp;
}
static int intel_pstate_set_epb(int cpu, s16 pref)
{
u64 epb;
int ret;
if (!boot_cpu_has(X86_FEATURE_EPB))
return -ENXIO;
ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
if (ret)
return ret;
epb = (epb & ~0x0f) | pref;
wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, epb);
return 0;
}
/*
* EPP/EPB display strings corresponding to EPP index in the
* energy_perf_strings[]
* index String
*-------------------------------------
* 0 default
* 1 performance
* 2 balance_performance
* 3 balance_power
* 4 power
*/
enum energy_perf_value_index {
EPP_INDEX_DEFAULT = 0,
EPP_INDEX_PERFORMANCE,
EPP_INDEX_BALANCE_PERFORMANCE,
EPP_INDEX_BALANCE_POWERSAVE,
EPP_INDEX_POWERSAVE,
};
static const char * const energy_perf_strings[] = {
[EPP_INDEX_DEFAULT] = "default",
[EPP_INDEX_PERFORMANCE] = "performance",
[EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance",
[EPP_INDEX_BALANCE_POWERSAVE] = "balance_power",
[EPP_INDEX_POWERSAVE] = "power",
NULL
};
static unsigned int epp_values[] = {
[EPP_INDEX_DEFAULT] = 0, /* Unused index */
[EPP_INDEX_PERFORMANCE] = HWP_EPP_PERFORMANCE,
[EPP_INDEX_BALANCE_PERFORMANCE] = HWP_EPP_BALANCE_PERFORMANCE,
[EPP_INDEX_BALANCE_POWERSAVE] = HWP_EPP_BALANCE_POWERSAVE,
[EPP_INDEX_POWERSAVE] = HWP_EPP_POWERSAVE,
};
static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data, int *raw_epp)
{
s16 epp;
int index = -EINVAL;
*raw_epp = 0;
epp = intel_pstate_get_epp(cpu_data, 0);
if (epp < 0)
return epp;
if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
if (epp == epp_values[EPP_INDEX_PERFORMANCE])
return EPP_INDEX_PERFORMANCE;
if (epp == epp_values[EPP_INDEX_BALANCE_PERFORMANCE])
return EPP_INDEX_BALANCE_PERFORMANCE;
if (epp == epp_values[EPP_INDEX_BALANCE_POWERSAVE])
return EPP_INDEX_BALANCE_POWERSAVE;
if (epp == epp_values[EPP_INDEX_POWERSAVE])
return EPP_INDEX_POWERSAVE;
*raw_epp = epp;
return 0;
} else if (boot_cpu_has(X86_FEATURE_EPB)) {
/*
* Range:
* 0x00-0x03 : Performance
* 0x04-0x07 : Balance performance
* 0x08-0x0B : Balance power
* 0x0C-0x0F : Power
* The EPB is a 4 bit value, but our ranges restrict the
* value which can be set. Here only using top two bits
* effectively.
*/
index = (epp >> 2) + 1;
}
return index;
}
static int intel_pstate_set_epp(struct cpudata *cpu, u32 epp)
{
int ret;
/*
* Use the cached HWP Request MSR value, because in the active mode the
* register itself may be updated by intel_pstate_hwp_boost_up() or
* intel_pstate_hwp_boost_down() at any time.
*/
u64 value = READ_ONCE(cpu->hwp_req_cached);
value &= ~GENMASK_ULL(31, 24);
value |= (u64)epp << 24;
/*
* The only other updater of hwp_req_cached in the active mode,
* intel_pstate_hwp_set(), is called under the same lock as this
* function, so it cannot run in parallel with the update below.
*/
WRITE_ONCE(cpu->hwp_req_cached, value);
ret = wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
if (!ret)
cpu->epp_cached = epp;
return ret;
}
static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data,
int pref_index, bool use_raw,
u32 raw_epp)
{
int epp = -EINVAL;
int ret;
if (!pref_index)
epp = cpu_data->epp_default;
if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
if (use_raw)
epp = raw_epp;
else if (epp == -EINVAL)
epp = epp_values[pref_index];
/*
* To avoid confusion, refuse to set EPP to any values different
* from 0 (performance) if the current policy is "performance",
* because those values would be overridden.
*/
if (epp > 0 && cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE)
return -EBUSY;
ret = intel_pstate_set_epp(cpu_data, epp);
} else {
if (epp == -EINVAL)
epp = (pref_index - 1) << 2;
ret = intel_pstate_set_epb(cpu_data->cpu, epp);
}
return ret;
}
static ssize_t show_energy_performance_available_preferences(
struct cpufreq_policy *policy, char *buf)
{
int i = 0;
int ret = 0;
while (energy_perf_strings[i] != NULL)
ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]);
ret += sprintf(&buf[ret], "\n");
return ret;
}
cpufreq_freq_attr_ro(energy_performance_available_preferences);
static struct cpufreq_driver intel_pstate;
static ssize_t store_energy_performance_preference(
struct cpufreq_policy *policy, const char *buf, size_t count)
{
struct cpudata *cpu = all_cpu_data[policy->cpu];
char str_preference[21];
bool raw = false;
ssize_t ret;
u32 epp = 0;
ret = sscanf(buf, "%20s", str_preference);
if (ret != 1)
return -EINVAL;
ret = match_string(energy_perf_strings, -1, str_preference);
if (ret < 0) {
if (!boot_cpu_has(X86_FEATURE_HWP_EPP))
return ret;
ret = kstrtouint(buf, 10, &epp);
if (ret)
return ret;
if (epp > 255)
return -EINVAL;
raw = true;
}
/*
* This function runs with the policy R/W semaphore held, which
* guarantees that the driver pointer will not change while it is
* running.
*/
if (!intel_pstate_driver)
return -EAGAIN;
mutex_lock(&intel_pstate_limits_lock);
if (intel_pstate_driver == &intel_pstate) {
ret = intel_pstate_set_energy_pref_index(cpu, ret, raw, epp);
} else {
/*
* In the passive mode the governor needs to be stopped on the
* target CPU before the EPP update and restarted after it,
* which is super-heavy-weight, so make sure it is worth doing
* upfront.
*/
if (!raw)
epp = ret ? epp_values[ret] : cpu->epp_default;
if (cpu->epp_cached != epp) {
int err;
cpufreq_stop_governor(policy);
ret = intel_pstate_set_epp(cpu, epp);
err = cpufreq_start_governor(policy);
if (!ret)
ret = err;
}
}
mutex_unlock(&intel_pstate_limits_lock);
return ret ?: count;
}
static ssize_t show_energy_performance_preference(
struct cpufreq_policy *policy, char *buf)
{
struct cpudata *cpu_data = all_cpu_data[policy->cpu];
int preference, raw_epp;
preference = intel_pstate_get_energy_pref_index(cpu_data, &raw_epp);
if (preference < 0)
return preference;
if (raw_epp)
return sprintf(buf, "%d\n", raw_epp);
else
return sprintf(buf, "%s\n", energy_perf_strings[preference]);
}
cpufreq_freq_attr_rw(energy_performance_preference);
static ssize_t show_base_frequency(struct cpufreq_policy *policy, char *buf)
{
struct cpudata *cpu = all_cpu_data[policy->cpu];
int ratio, freq;
ratio = intel_pstate_get_cppc_guaranteed(policy->cpu);
if (ratio <= 0) {
u64 cap;
rdmsrl_on_cpu(policy->cpu, MSR_HWP_CAPABILITIES, &cap);
ratio = HWP_GUARANTEED_PERF(cap);
}
freq = ratio * cpu->pstate.scaling;
if (cpu->pstate.scaling != cpu->pstate.perf_ctl_scaling)
freq = rounddown(freq, cpu->pstate.perf_ctl_scaling);
return sprintf(buf, "%d\n", freq);
}
cpufreq_freq_attr_ro(base_frequency);
static struct freq_attr *hwp_cpufreq_attrs[] = {
&energy_performance_preference,
&energy_performance_available_preferences,
&base_frequency,
NULL,
};
static void __intel_pstate_get_hwp_cap(struct cpudata *cpu)
{
u64 cap;
rdmsrl_on_cpu(cpu->cpu, MSR_HWP_CAPABILITIES, &cap);
WRITE_ONCE(cpu->hwp_cap_cached, cap);
cpu->pstate.max_pstate = HWP_GUARANTEED_PERF(cap);
cpu->pstate.turbo_pstate = HWP_HIGHEST_PERF(cap);
}
static void intel_pstate_get_hwp_cap(struct cpudata *cpu)
{
int scaling = cpu->pstate.scaling;
__intel_pstate_get_hwp_cap(cpu);
cpu->pstate.max_freq = cpu->pstate.max_pstate * scaling;
cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * scaling;
if (scaling != cpu->pstate.perf_ctl_scaling) {
int perf_ctl_scaling = cpu->pstate.perf_ctl_scaling;
cpu->pstate.max_freq = rounddown(cpu->pstate.max_freq,
perf_ctl_scaling);
cpu->pstate.turbo_freq = rounddown(cpu->pstate.turbo_freq,
perf_ctl_scaling);
}
}
static void intel_pstate_hwp_set(unsigned int cpu)
{
struct cpudata *cpu_data = all_cpu_data[cpu];
int max, min;
u64 value;
s16 epp;
max = cpu_data->max_perf_ratio;
min = cpu_data->min_perf_ratio;
if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE)
min = max;
rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
value &= ~HWP_MIN_PERF(~0L);
value |= HWP_MIN_PERF(min);
value &= ~HWP_MAX_PERF(~0L);
value |= HWP_MAX_PERF(max);
if (cpu_data->epp_policy == cpu_data->policy)
goto skip_epp;
cpu_data->epp_policy = cpu_data->policy;
if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) {
epp = intel_pstate_get_epp(cpu_data, value);
cpu_data->epp_powersave = epp;
/* If EPP read was failed, then don't try to write */
if (epp < 0)
goto skip_epp;
epp = 0;
} else {
/* skip setting EPP, when saved value is invalid */
if (cpu_data->epp_powersave < 0)
goto skip_epp;
/*
* No need to restore EPP when it is not zero. This
* means:
* - Policy is not changed
* - user has manually changed
* - Error reading EPB
*/
epp = intel_pstate_get_epp(cpu_data, value);
if (epp)
goto skip_epp;
epp = cpu_data->epp_powersave;
}