Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler changes from Ingo Molnar:

 - Add the initial implementation of SCHED_DEADLINE support: a real-time
   scheduling policy in which each task declares a runtime budget, a
   relative deadline and a period; tasks that stay within their runtime
   budget get real-time scheduling and will not miss their deadlines,
   while tasks that exceed their budget are throttled.  (Available to
   privileged users for now; a minimal userspace usage sketch follows
   this list.)

 - Clean up and fix preempt_enable_no_resched() abuse all around the
   tree

 - Do sched_clock() performance optimizations on x86 and elsewhere

 - Fix and improve auto-NUMA balancing

 - Fix and clean up the idle loop

 - Apply various cleanups and fixes
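
For illustration, here is a minimal userspace sketch of how a privileged task might request SCHED_DEADLINE through the new sched_setattr() syscall. It is not part of this commit: the struct layout and SCHED_DEADLINE value mirror the uapi headers added by this series, and the fallback syscall number is the ARM EABI value wired up further down; take the real number from your architecture's unistd.h.

/*
 * Illustrative only -- not part of this commit.  A privileged task asks for
 * SCHED_DEADLINE with a 10ms runtime budget, 30ms relative deadline and
 * 100ms period via the new sched_setattr() syscall.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_sched_setattr
#define __NR_sched_setattr 380		/* ARM EABI value from this series; other arches differ */
#endif

#define SCHED_DEADLINE 6

struct sched_attr {			/* mirrors the new uapi struct sched_attr */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL, SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO, SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE; all three in nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size           = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  =  10 * 1000 * 1000;	/* may use up to 10ms of CPU ...          */
	attr.sched_deadline =  30 * 1000 * 1000;	/* ... within 30ms of each activation ... */
	attr.sched_period   = 100 * 1000 * 1000;	/* ... which recurs every 100ms           */

	/* pid 0 == calling task; the trailing flags argument must be 0 */
	if (syscall(__NR_sched_setattr, 0, &attr, 0) < 0) {
		perror("sched_setattr");
		return 1;
	}

	/* periodic real-time work would run here */
	return 0;
}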

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (60 commits)
  sched: Fix __sched_setscheduler() nice test
  sched: Move SCHED_RESET_ON_FORK into attr::sched_flags
  sched: Fix up attr::sched_priority warning
  sched: Fix up scheduler syscall LTP fails
  sched: Preserve the nice level over sched_setscheduler() and sched_setparam() calls
  sched/core: Fix htmldocs warnings
  sched/deadline: No need to check p if dl_se is valid
  sched/deadline: Remove unused variables
  sched/deadline: Fix sparse static warnings
  m68k: Fix build warning in mac_via.h
  sched, thermal: Clean up preempt_enable_no_resched() abuse
  sched, net: Fixup busy_loop_us_clock()
  sched, net: Clean up preempt_enable_no_resched() abuse
  sched/preempt: Fix up missed PREEMPT_NEED_RESCHED folding
  sched/preempt, locking: Rework local_bh_{dis,en}able()
  sched/clock, x86: Avoid a runtime condition in native_sched_clock()
  sched/clock: Fix up clear_sched_clock_stable()
  sched/clock, x86: Use a static_key for sched_clock_stable
  sched/clock: Remove local_irq_disable() from the clocks
  sched/clock, x86: Rewrite cyc2ns() to avoid the need to disable IRQs
  ...
torvalds committed Jan 20, 2014
2 parents 9326657 + eaad451 commit a0fa1dd
Showing 63 changed files with 3,775 additions and 626 deletions.
5 changes: 0 additions & 5 deletions Documentation/sysctl/kernel.txt
@@ -428,11 +428,6 @@ rate for each task.
numa_balancing_scan_size_mb is how many megabytes worth of pages are
scanned for a given scan.

numa_balancing_settle_count is how many scan periods must complete before
the schedule balancer stops pushing the task towards a preferred node. This
gives the scheduler a chance to place the task on an alternative node if the
preferred node is overloaded.

numa_balancing_migrate_deferred is how many page migrations get skipped
unconditionally, after a page migration is skipped because a page is shared
with other tasks. This reduces page migration overhead, and determines
2 changes: 1 addition & 1 deletion arch/arm/include/asm/unistd.h
@@ -15,7 +15,7 @@

#include <uapi/asm/unistd.h>

#define __NR_syscalls (380)
#define __NR_syscalls (384)
#define __ARM_NR_cmpxchg (__ARM_NR_BASE+0x00fff0)

#define __ARCH_WANT_STAT64
2 changes: 2 additions & 0 deletions arch/arm/include/uapi/asm/unistd.h
@@ -406,6 +406,8 @@
#define __NR_process_vm_writev (__NR_SYSCALL_BASE+377)
#define __NR_kcmp (__NR_SYSCALL_BASE+378)
#define __NR_finit_module (__NR_SYSCALL_BASE+379)
#define __NR_sched_setattr (__NR_SYSCALL_BASE+380)
#define __NR_sched_getattr (__NR_SYSCALL_BASE+381)

/*
* This may need to be greater than __NR_last_syscall+1 in order to
2 changes: 2 additions & 0 deletions arch/arm/kernel/calls.S
@@ -389,6 +389,8 @@
CALL(sys_process_vm_writev)
CALL(sys_kcmp)
CALL(sys_finit_module)
/* 380 */ CALL(sys_sched_setattr)
CALL(sys_sched_getattr)
#ifndef syscalls_counted
.equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
#define syscalls_counted
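
As a rough counterpart to the sched_setattr() sketch near the top, the second table entry wired up above can be exercised to read the attributes back. Again illustrative only: 381 is the ARM EABI __NR_sched_getattr added by this series, and the struct layout repeats the one sketched earlier.

/*
 * Illustrative only: read the calling task's scheduling attributes back
 * through the second new entry, sched_getattr().
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_sched_getattr
#define __NR_sched_getattr 381		/* ARM EABI number wired up above; other arches differ */
#endif

struct sched_attr {			/* same layout as in the earlier sched_setattr() sketch */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	/* args: pid (0 == self), buffer, buffer size, flags (must be 0) */
	if (syscall(__NR_sched_getattr, 0, &attr, sizeof(attr), 0) < 0) {
		perror("sched_getattr");
		return 1;
	}

	printf("policy=%u runtime=%llu deadline=%llu period=%llu\n",
	       attr.sched_policy,
	       (unsigned long long)attr.sched_runtime,
	       (unsigned long long)attr.sched_deadline,
	       (unsigned long long)attr.sched_period);
	return 0;
}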
2 changes: 2 additions & 0 deletions arch/m68k/include/asm/mac_via.h
@@ -254,6 +254,8 @@
extern volatile __u8 *via1,*via2;
extern int rbv_present,via_alt_mapping;

struct irq_desc;

extern void via_register_interrupts(void);
extern void via_irq_enable(int);
extern void via_irq_disable(int);
43 changes: 43 additions & 0 deletions arch/x86/include/asm/mwait.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_MWAIT_H
#define _ASM_X86_MWAIT_H

#include <linux/sched.h>

#define MWAIT_SUBSTATE_MASK 0xf
#define MWAIT_CSTATE_MASK 0xf
#define MWAIT_SUBSTATE_SIZE 4
@@ -13,4 +15,45 @@

#define MWAIT_ECX_INTERRUPT_BREAK 0x1

static inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
{
/* "monitor %eax, %ecx, %edx;" */
asm volatile(".byte 0x0f, 0x01, 0xc8;"
:: "a" (eax), "c" (ecx), "d"(edx));
}

static inline void __mwait(unsigned long eax, unsigned long ecx)
{
/* "mwait %eax, %ecx;" */
asm volatile(".byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
}

/*
* This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
* which can obviate IPI to trigger checking of need_resched.
* We execute MONITOR against need_resched and enter optimized wait state
* through MWAIT. Whenever someone changes need_resched, we would be woken
* up from MWAIT (without an IPI).
*
* New with Core Duo processors, MWAIT can take some hints based on CPU
* capability.
*/
static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
if (!current_set_polling_and_test()) {
if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
mb();
clflush((void *)&current_thread_info()->flags);
mb();
}

__monitor((void *)&current_thread_info()->flags, 0, 0);
if (!need_resched())
__mwait(eax, ecx);
}
current_clr_polling();
}

#endif /* _ASM_X86_MWAIT_H */
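
For context, a sketch of how an idle driver might call the helper that now lives in this header. The EAX hint encoding assumed here (target C-state in the upper nibble, sub-state in the lower nibble) and the ECX break-on-interrupt bit follow the MWAIT_* constants above, but the concrete hint values are hardware-specific, and the function name is purely hypothetical.

/* Illustrative idle-driver fragment, not part of this commit. */
#include <asm/mwait.h>

void example_enter_mwait_cstate(unsigned int cstate, unsigned int substate)
{
	unsigned long eax, ecx;

	/* Assumed hint encoding: target C-state in bits 7..4, sub-state in bits 3..0. */
	eax = ((cstate & MWAIT_CSTATE_MASK) << MWAIT_SUBSTATE_SIZE) |
	      (substate & MWAIT_SUBSTATE_MASK);
	/* Ask the CPU to break out of MWAIT on interrupts even with IRQs masked. */
	ecx = MWAIT_ECX_INTERRUPT_BREAK;

	/*
	 * mwait_idle_with_hints() arms MONITOR on the current task's flags word,
	 * re-checks need_resched() and only then executes MWAIT, so a remote
	 * set_tsk_need_resched() wakes the CPU without an IPI.
	 */
	mwait_idle_with_hints(eax, ecx);
}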
23 changes: 0 additions & 23 deletions arch/x86/include/asm/processor.h
@@ -700,29 +700,6 @@ static inline void sync_core(void)
#endif
}

static inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
{
/* "monitor %eax, %ecx, %edx;" */
asm volatile(".byte 0x0f, 0x01, 0xc8;"
:: "a" (eax), "c" (ecx), "d"(edx));
}

static inline void __mwait(unsigned long eax, unsigned long ecx)
{
/* "mwait %eax, %ecx;" */
asm volatile(".byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
}

static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
trace_hardirqs_on();
/* "mwait %eax, %ecx;" */
asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
}

extern void select_idle_routine(const struct cpuinfo_x86 *c);
extern void init_amd_e400_c1e_mask(void);

77 changes: 18 additions & 59 deletions arch/x86/include/asm/timer.h
@@ -4,6 +4,7 @@
#include <linux/pm.h>
#include <linux/percpu.h>
#include <linux/interrupt.h>
#include <linux/math64.h>

#define TICK_SIZE (tick_nsec / 1000)

@@ -12,68 +13,26 @@ extern int recalibrate_cpu_khz(void);

extern int no_timer_check;

/* Accelerators for sched_clock()
* convert from cycles(64bits) => nanoseconds (64bits)
* basic equation:
* ns = cycles / (freq / ns_per_sec)
* ns = cycles * (ns_per_sec / freq)
* ns = cycles * (10^9 / (cpu_khz * 10^3))
* ns = cycles * (10^6 / cpu_khz)
/*
* We use the full linear equation: f(x) = a + b*x, in order to allow
* a continuous function in the face of dynamic freq changes.
*
* Then we use scaling math (suggested by george@mvista.com) to get:
* ns = cycles * (10^6 * SC / cpu_khz) / SC
* ns = cycles * cyc2ns_scale / SC
* Continuity means that when our frequency changes our slope (b); we want to
* ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t.
*
* And since SC is a constant power of two, we can convert the div
* into a shift.
* Without an offset (a) the above would not be possible.
*
* We can use khz divisor instead of mhz to keep a better precision, since
* cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
* (mathieu.desnoyers@polymtl.ca)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*
* In:
*
* ns = cycles * cyc2ns_scale / SC
*
* Although we may still have enough bits to store the value of ns,
* in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
* leading to an incorrect result.
*
* To avoid this, we can decompose 'cycles' into quotient and remainder
* of division by SC. Then,
*
* ns = (quot * SC + rem) * cyc2ns_scale / SC
* = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
*
* - sqazi@google.com
* See the comment near cycles_2_ns() for details on how we compute (b).
*/

DECLARE_PER_CPU(unsigned long, cyc2ns);
DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);

#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */

static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
{
int cpu = smp_processor_id();
unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
ns += mult_frac(cyc, per_cpu(cyc2ns, cpu),
(1UL << CYC2NS_SCALE_FACTOR));
return ns;
}

static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
unsigned long long ns;
unsigned long flags;

local_irq_save(flags);
ns = __cycles_2_ns(cyc);
local_irq_restore(flags);

return ns;
}
struct cyc2ns_data {
u32 cyc2ns_mul;
u32 cyc2ns_shift;
u64 cyc2ns_offset;
u32 __count;
/* u32 hole */
}; /* 24 bytes -- do not grow */

extern struct cyc2ns_data *cyc2ns_read_begin(void);
extern void cyc2ns_read_end(struct cyc2ns_data *);

#endif /* _ASM_X86_TIMER_H */
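
Roughly, a reader is expected to evaluate the linear equation from the comment above, ns = a + b*cyc, under the begin/end pair. A sketch of that consuming side follows (the function name is hypothetical; the split multiply uses mul_u64_u32_shr(), which is presumably why this hunk adds the <linux/math64.h> include):

/* Sketch of the read side: ns = offset (a) + cyc * mul >> shift (the b*x term). */
#include <linux/math64.h>
#include <asm/timer.h>

static inline unsigned long long example_cycles_2_ns(unsigned long long cyc)
{
	struct cyc2ns_data *data = cyc2ns_read_begin();
	unsigned long long ns;

	ns  = data->cyc2ns_offset;
	ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);

	cyc2ns_read_end(data);

	return ns;
}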
23 changes: 0 additions & 23 deletions arch/x86/kernel/acpi/cstate.c
@@ -150,29 +150,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);

/*
* This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
* which can obviate IPI to trigger checking of need_resched.
* We execute MONITOR against need_resched and enter optimized wait state
* through MWAIT. Whenever someone changes need_resched, we would be woken
* up from MWAIT (without an IPI).
*
* New with Core Duo processors, MWAIT can take some hints based on CPU
* capability.
*/
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
if (!need_resched()) {
if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);

__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__mwait(ax, cx);
}
}

void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
{
unsigned int cpu = smp_processor_id();
2 changes: 1 addition & 1 deletion arch/x86/kernel/cpu/amd.c
@@ -487,7 +487,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
if (!check_tsc_unstable())
sched_clock_stable = 1;
set_sched_clock_stable();
}

#ifdef CONFIG_X86_64
2 changes: 1 addition & 1 deletion arch/x86/kernel/cpu/intel.c
@@ -93,7 +93,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
if (!check_tsc_unstable())
sched_clock_stable = 1;
set_sched_clock_stable();
}

/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
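
Both this hunk and the AMD one above switch from writing a plain global to calling set_sched_clock_stable(); per the "Use a static_key for sched_clock_stable" commit in the shortlog, the flag now patches a branch instead of loading a variable on every sched_clock() call. A rough sketch of the shape of that interface, under the assumption that it follows kernel/sched/clock.c of this era (the real version also worries about clock continuity, omitted here):

/* Sketch only: a read-mostly boolean backed by a static_key. */
#include <linux/jump_label.h>

static struct static_key __sched_clock_stable = STATIC_KEY_INIT;

int sched_clock_stable(void)
{
	/* compiles to a patched branch on the sched_clock() hot path */
	return static_key_false(&__sched_clock_stable);
}

void set_sched_clock_stable(void)
{
	static_key_slow_inc(&__sched_clock_stable);
}

void clear_sched_clock_stable(void)
{
	/* the real implementation also fixes up clock offsets here */
	static_key_slow_dec(&__sched_clock_stable);
}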
16 changes: 11 additions & 5 deletions arch/x86/kernel/cpu/perf_event.c
@@ -1883,21 +1883,27 @@ static struct pmu pmu = {

void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
{
struct cyc2ns_data *data;

userpg->cap_user_time = 0;
userpg->cap_user_time_zero = 0;
userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
userpg->pmc_width = x86_pmu.cntval_bits;

if (!sched_clock_stable)
if (!sched_clock_stable())
return;

data = cyc2ns_read_begin();

userpg->cap_user_time = 1;
userpg->time_mult = this_cpu_read(cyc2ns);
userpg->time_shift = CYC2NS_SCALE_FACTOR;
userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
userpg->time_mult = data->cyc2ns_mul;
userpg->time_shift = data->cyc2ns_shift;
userpg->time_offset = data->cyc2ns_offset - now;

userpg->cap_user_time_zero = 1;
userpg->time_zero = this_cpu_read(cyc2ns_offset);
userpg->time_zero = data->cyc2ns_offset;

cyc2ns_read_end(data);
}

/*
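
On the consuming side, userspace that maps the event's perf_event_mmap_page can turn a raw TSC read into sched_clock() nanoseconds with the mult/shift/offset triple exported above, provided cap_user_time is set. A hedged sketch (the function name is hypothetical; field names follow struct perf_event_mmap_page, and the required re-check of the page's sequence lock around the reads is omitted):

/* Sketch: convert a TSC value to scheduler-clock nanoseconds using the
 * mult/shift/offset published by arch_perf_update_userpage() above. */
#include <stdint.h>

static uint64_t tsc_to_perf_time(uint64_t cyc, uint32_t time_mult,
				 uint16_t time_shift, uint64_t time_offset)
{
	/* Split the multiplication so cyc * mult cannot overflow 64 bits. */
	uint64_t quot = cyc >> time_shift;
	uint64_t rem  = cyc & (((uint64_t)1 << time_shift) - 1);

	return time_offset + quot * time_mult + ((rem * time_mult) >> time_shift);
}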
2 changes: 2 additions & 0 deletions arch/x86/kernel/smpboot.c
@@ -1417,7 +1417,9 @@ static inline void mwait_play_dead(void)
* The WBINVD is insufficient due to the spurious-wakeup
* case where we return around the loop.
*/
mb();
clflush(mwait_ptr);
mb();
__monitor(mwait_ptr, 0, 0);
mb();
__mwait(eax, 0);