diff --git a/bsd/conf/files.arm64 b/bsd/conf/files.arm64 index 7761c03ac..9d631c5fb 100644 --- a/bsd/conf/files.arm64 +++ b/bsd/conf/files.arm64 @@ -9,6 +9,9 @@ bsd/dev/arm/unix_signal.c standard bsd/dev/arm64/cpu_in_cksum.s standard +#if defined(KERNEL_INTEGRITY_CTRR) +bsd/tests/ctrr_test_sysctl.c optional config_xnupost +#endif /* defined(KERNEL_INTEGRITY_CTRR) */ bsd/dev/arm64/dtrace_isa.c optional config_dtrace bsd/dev/arm64/dtrace_subr_arm.c optional config_dtrace diff --git a/bsd/dev/arm64/dtrace_isa.c b/bsd/dev/arm64/dtrace_isa.c index 5714f7971..56d1729f5 100644 --- a/bsd/dev/arm64/dtrace_isa.c +++ b/bsd/dev/arm64/dtrace_isa.c @@ -54,6 +54,10 @@ typedef arm_saved_state_t savearea_t; extern lck_attr_t *dtrace_lck_attr; extern lck_grp_t *dtrace_lck_grp; +#if XNU_MONITOR +extern void * pmap_stacks_start; +extern void * pmap_stacks_end; +#endif struct frame { struct frame *backchain; @@ -455,6 +459,14 @@ dtrace_getufpstack(uint64_t * pcstack, uint64_t * fpstack, int pcstack_limit) } } +#if XNU_MONITOR +static inline boolean_t +dtrace_frame_in_ppl_stack(struct frame * fp) +{ + return ((void *)fp >= pmap_stacks_start) && + ((void *)fp < pmap_stacks_end); +} +#endif void dtrace_getpcstack(pc_t * pcstack, int pcstack_limit, int aframes, @@ -464,6 +476,9 @@ dtrace_getpcstack(pc_t * pcstack, int pcstack_limit, int aframes, struct frame *nextfp, *minfp, *stacktop; int depth = 0; int on_intr; +#if XNU_MONITOR + int on_ppl_stack; +#endif int last = 0; uintptr_t pc; uintptr_t caller = CPU->cpu_dtrace_caller; @@ -471,6 +486,11 @@ dtrace_getpcstack(pc_t * pcstack, int pcstack_limit, int aframes, if ((on_intr = CPU_ON_INTR(CPU)) != 0) { stacktop = (struct frame *) dtrace_get_cpu_int_stack_top(); } +#if XNU_MONITOR + else if ((on_ppl_stack = dtrace_frame_in_ppl_stack(fp))) { + stacktop = (struct frame *) pmap_stacks_end; + } +#endif else { stacktop = (struct frame *) (dtrace_get_kernel_stack(current_thread()) + kernel_stack_size); } @@ -496,6 +516,14 @@ 
dtrace_getpcstack(pc_t * pcstack, int pcstack_limit, int aframes, if (arm_kern_regs) { nextfp = (struct frame *)(saved_state64(arm_kern_regs)->fp); +#if XNU_MONITOR + on_ppl_stack = dtrace_frame_in_ppl_stack(nextfp); + + if (on_ppl_stack) { + minfp = pmap_stacks_start; + stacktop = pmap_stacks_end; + } else +#endif { vm_offset_t kstack_base = dtrace_get_kernel_stack(current_thread()); @@ -517,6 +545,30 @@ dtrace_getpcstack(pc_t * pcstack, int pcstack_limit, int aframes, last = 1; } } else { +#if XNU_MONITOR + if ((!on_ppl_stack) && dtrace_frame_in_ppl_stack(nextfp)) { + /* + * We are switching from the kernel stack + * to the PPL stack. + */ + on_ppl_stack = 1; + minfp = pmap_stacks_start; + stacktop = pmap_stacks_end; + } else if (on_ppl_stack) { + /* + * We could be going from the PPL stack + * to the kernel stack. + */ + vm_offset_t kstack_base = dtrace_get_kernel_stack(current_thread()); + + minfp = (struct frame *)kstack_base; + stacktop = (struct frame *)(kstack_base + kernel_stack_size); + + if (nextfp <= minfp || nextfp >= stacktop) { + last = 1; + } + } else +#endif { /* * This is the last frame we can process; indicate diff --git a/bsd/dev/arm64/sysctl.c b/bsd/dev/arm64/sysctl.c index d67aa4a0b..fd7055cb7 100644 --- a/bsd/dev/arm64/sysctl.c +++ b/bsd/dev/arm64/sysctl.c @@ -46,6 +46,30 @@ SYSCTL_PROC(_machdep, OID_AUTO, wake_conttime, 0, 0, sysctl_wake_conttime, "I", "Continuous Time at the last wakeup"); +#if defined(HAS_IPI) +static int +cpu_signal_deferred_timer(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int new_value = 0; + int changed = 0; + + int old_value = (int)ml_cpu_signal_deferred_get_timer(); + + int error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed); + + if (error == 0 && changed) { + ml_cpu_signal_deferred_adjust_timer((uint64_t)new_value); + } + + return error; +} + +SYSCTL_PROC(_machdep, OID_AUTO, deferred_ipi_timeout, + CTLTYPE_INT | CTLFLAG_RW | 
CTLFLAG_LOCKED, + 0, 0, + cpu_signal_deferred_timer, "I", "Deferred IPI timeout (nanoseconds)"); + +#endif /* defined(HAS_IPI) */ /* * For source compatibility, here's some machdep.cpu mibs that diff --git a/bsd/kern/kern_core.c b/bsd/kern/kern_core.c index 77c38ccbe..46fbd3ee5 100644 --- a/bsd/kern/kern_core.c +++ b/bsd/kern/kern_core.c @@ -66,6 +66,10 @@ #include +#if CONFIG_MACF +#include +#endif /* CONFIG_MACF */ + #if CONFIG_CSR #include #include @@ -221,7 +225,7 @@ collectth_state(thread_t th_act, void *tirp) * coredump_flags Extra options (ignore rlimit, run fsync) * * Returns: 0 Success - * EFAULT Failed + * !0 Failure errno * * IMPORTANT: This function can only be called on the current process, due * to assumptions below; see variable declaration section for @@ -252,7 +256,7 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags) int error1 = 0; char stack_name[MAXCOMLEN + 6]; char *alloced_name = NULL; - char *name; + char *name = NULL; mythread_state_flavor_t flavors[MAX_TSTATE_FLAVORS]; vm_size_t mapsize; int i; @@ -276,11 +280,16 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags) ((sugid_coredump == 0) && /* Not dumping SUID/SGID binaries */ ((kauth_cred_getsvuid(cred) != kauth_cred_getruid(cred)) || (kauth_cred_getsvgid(cred) != kauth_cred_getrgid(cred))))) { -#if CONFIG_AUDIT - audit_proc_coredump(core_proc, NULL, EFAULT); -#endif - return EFAULT; + error = EFAULT; + goto out2; + } + +#if CONFIG_MACF + error = mac_proc_check_dump_core(core_proc); + if (error != 0) { + goto out2; } +#endif #if CONFIG_CSR /* If the process is restricted, CSR isn't configured to allow @@ -289,10 +298,8 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags) if (cs_restricted(core_proc) && csr_check(CSR_ALLOW_TASK_FOR_PID) && csr_check(CSR_ALLOW_APPLE_INTERNAL)) { -#if CONFIG_AUDIT - audit_proc_coredump(core_proc, NULL, EFAULT); -#endif - return EFAULT; + error = EPERM; + goto out2; } #endif @@ -306,7 +313,8 @@ 
coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags) if (((coredump_flags & COREDUMP_IGNORE_ULIMIT) == 0) && (mapsize >= core_proc->p_rlimit[RLIMIT_CORE].rlim_cur)) { - return EFAULT; + error = EFAULT; + goto out2; } (void) task_suspend_internal(task); diff --git a/bsd/kern/kern_memorystatus.c b/bsd/kern/kern_memorystatus.c index afc9271dc..81f60d5e8 100644 --- a/bsd/kern/kern_memorystatus.c +++ b/bsd/kern/kern_memorystatus.c @@ -1321,7 +1321,11 @@ int max_jetsam_threads = JETSAM_THREADS_LIMIT; * - Raise the jetsam threshold ("clear-the-deck") * - Enabled parallel jetsam on eligible devices */ +#if __AMP__ +int fast_jetsam_enabled = 1; +#else /* __AMP__ */ int fast_jetsam_enabled = 0; +#endif /* __AMP__ */ /* Routine to find the jetsam state structure for the current jetsam thread */ static inline struct jetsam_thread_state * diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index 0c10a3ac4..5d2fcee09 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -2132,6 +2132,28 @@ SYSCTL_PROC(_kern_perfcontrol_callout, OID_AUTO, update_cycles, (void *)PERFCONTROL_STAT_CYCLES, PERFCONTROL_CALLOUT_STATE_UPDATE, sysctl_perfcontrol_callout_stat, "I", ""); +#if __AMP__ +extern int sched_amp_idle_steal; +SYSCTL_INT(_kern, OID_AUTO, sched_amp_idle_steal, + CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, + &sched_amp_idle_steal, 0, ""); +extern int sched_amp_spill_steal; +SYSCTL_INT(_kern, OID_AUTO, sched_amp_spill_steal, + CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, + &sched_amp_spill_steal, 0, ""); +extern int sched_amp_spill_count; +SYSCTL_INT(_kern, OID_AUTO, sched_amp_spill_count, + CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, + &sched_amp_spill_count, 0, ""); +extern int sched_amp_spill_deferred_ipi; +SYSCTL_INT(_kern, OID_AUTO, sched_amp_spill_deferred_ipi, + CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, + &sched_amp_spill_deferred_ipi, 0, ""); +extern int sched_amp_pcores_preempt_immediate_ipi; +SYSCTL_INT(_kern, OID_AUTO, 
sched_amp_pcores_preempt_immediate_ipi, + CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, + &sched_amp_pcores_preempt_immediate_ipi, 0, ""); +#endif /* __AMP__ */ #endif /* __arm__ || __arm64__ */ #if __arm64__ diff --git a/bsd/kern/kern_xxx.c b/bsd/kern/kern_xxx.c index ee65deb6c..a07457cd6 100644 --- a/bsd/kern/kern_xxx.c +++ b/bsd/kern/kern_xxx.c @@ -116,8 +116,11 @@ reboot(struct proc *p, struct reboot_args *uap, __unused int32_t *retval) } if (uap->opt & RB_PANIC && uap->msg != USER_ADDR_NULL) { - if (copyinstr(uap->msg, (void *)message, sizeof(message), (size_t *)&dummy)) { + int copy_error = copyinstr(uap->msg, (void *)message, sizeof(message), (size_t *)&dummy); + if (copy_error != 0 && copy_error != ENAMETOOLONG) { strncpy(message, "user space RB_PANIC message copyin failed", sizeof(message) - 1); + } else { + message[sizeof(message) - 1] = '\0'; } } diff --git a/bsd/kern/policy_check.c b/bsd/kern/policy_check.c index de77a23be..ba02e1540 100644 --- a/bsd/kern/policy_check.c +++ b/bsd/kern/policy_check.c @@ -121,7 +121,7 @@ common_hook(void) return rv; } -#if (MAC_POLICY_OPS_VERSION != 58) +#if (MAC_POLICY_OPS_VERSION != 59) # error "struct mac_policy_ops doesn't match definition in mac_policy.h" #endif /* @@ -322,9 +322,9 @@ const static struct mac_policy_ops policy_ops = { CHECK_SET_HOOK(proc_check_setlcid) CHECK_SET_HOOK(proc_check_signal) CHECK_SET_HOOK(proc_check_wait) + CHECK_SET_HOOK(proc_check_dump_core) .mpo_reserved5 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved6 = (mpo_reserved_hook_t *)common_hook, CHECK_SET_HOOK(socket_check_accept) CHECK_SET_HOOK(socket_check_accepted) diff --git a/bsd/kern/sys_generic.c b/bsd/kern/sys_generic.c index d9fb9d1f9..bd2d1ad52 100644 --- a/bsd/kern/sys_generic.c +++ b/bsd/kern/sys_generic.c @@ -4024,6 +4024,104 @@ SYSCTL_PROC(_machdep_remotetime, OID_AUTO, conversion_params, #endif /* CONFIG_MACH_BRIDGE_RECV_TIME */ #if DEVELOPMENT || DEBUG +#if __AMP__ +#include +extern int32_t sysctl_get_bound_cpuid(void); 
+extern void sysctl_thread_bind_cpuid(int32_t cpuid); +static int +sysctl_kern_sched_thread_bind_cpu SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + + if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) { + return ENOENT; + } + + int32_t cpuid = sysctl_get_bound_cpuid(); + + int32_t new_value; + int changed; + int error = sysctl_io_number(req, cpuid, sizeof cpuid, &new_value, &changed); + if (error) { + return error; + } + + if (changed) { + sysctl_thread_bind_cpuid(new_value); + } + + return error; +} + +SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cpu, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_kern_sched_thread_bind_cpu, "I", ""); + +extern char sysctl_get_bound_cluster_type(void); +extern void sysctl_thread_bind_cluster_type(char cluster_type); +static int +sysctl_kern_sched_thread_bind_cluster_type SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + char buff[4]; + + if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) { + return ENOENT; + } + + int error = SYSCTL_IN(req, buff, 1); + if (error) { + return error; + } + char cluster_type = buff[0]; + + if (!req->newptr) { + goto out; + } + + sysctl_thread_bind_cluster_type(cluster_type); +out: + cluster_type = sysctl_get_bound_cluster_type(); + buff[0] = cluster_type; + + return SYSCTL_OUT(req, buff, 1); +} + +SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cluster_type, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_kern_sched_thread_bind_cluster_type, "A", ""); + +extern char sysctl_get_task_cluster_type(void); +extern void sysctl_task_set_cluster_type(char cluster_type); +static int +sysctl_kern_sched_task_set_cluster_type SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + char buff[4]; + + if (!PE_parse_boot_argn("enable_skstsct", NULL, 0)) { + return ENOENT; + } + + int error = SYSCTL_IN(req, buff, 1); + if (error) { + return error; + } + char cluster_type = buff[0]; + + if (!req->newptr) { + goto out; + } + + sysctl_task_set_cluster_type(cluster_type); 
+out: + cluster_type = sysctl_get_task_cluster_type(); + buff[0] = cluster_type; + + return SYSCTL_OUT(req, buff, 1); +} + +SYSCTL_PROC(_kern, OID_AUTO, sched_task_set_cluster_type, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_kern_sched_task_set_cluster_type, "A", ""); +#endif /* __AMP__ */ #endif /* DEVELOPMENT || DEBUG */ extern uint32_t task_exc_guard_default; diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c index 8acee164d..d38af044c 100644 --- a/bsd/net/dlil.c +++ b/bsd/net/dlil.c @@ -78,6 +78,7 @@ #include #include #include +#include #include #if INET #include diff --git a/bsd/net/if_headless.c b/bsd/net/if_headless.c index f7ebb1776..02c935096 100644 --- a/bsd/net/if_headless.c +++ b/bsd/net/if_headless.c @@ -25,6 +25,8 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +extern void if_headless_init(void); + void if_headless_init(void) { diff --git a/bsd/net/if_ipsec.c b/bsd/net/if_ipsec.c index 756272a89..0d1af9f9d 100644 --- a/bsd/net/if_ipsec.c +++ b/bsd/net/if_ipsec.c @@ -234,8 +234,6 @@ struct ipsec_pcb { #define IPSEC_FLAGS_KPIPE_ALLOCATED 1 /* data movement refcounting functions */ -static boolean_t ipsec_data_move_begin(struct ipsec_pcb *pcb); -static void ipsec_data_move_end(struct ipsec_pcb *pcb); static void ipsec_wait_data_move_drain(struct ipsec_pcb *pcb); /* Data path states */ @@ -2705,6 +2703,7 @@ ipsec_ctl_connect(kern_ctl_ref kctlref, bpfattach(pcb->ipsec_ifp, DLT_NULL, 0); } +#if IPSEC_NEXUS /* * Mark the data path as ready. * If kpipe nexus is being used then the data path is marked ready only when a kpipe channel is connected. 
@@ -2714,6 +2713,7 @@ ipsec_ctl_connect(kern_ctl_ref kctlref, IPSEC_SET_DATA_PATH_READY(pcb); lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock); } +#endif /* The interfaces resoures allocated, mark it as running */ ifnet_set_flags(pcb->ipsec_ifp, IFF_RUNNING, IFF_RUNNING); @@ -4083,34 +4083,6 @@ ipsec_set_ip6oa_for_interface(ifnet_t interface, struct ip6_out_args *ip6oa) } } -static boolean_t -ipsec_data_move_begin(struct ipsec_pcb *pcb) -{ - boolean_t ret = 0; - - lck_mtx_lock_spin(&pcb->ipsec_pcb_data_move_lock); - if ((ret = IPSEC_IS_DATA_PATH_READY(pcb))) { - pcb->ipsec_pcb_data_move++; - } - lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock); - - return ret; -} - -static void -ipsec_data_move_end(struct ipsec_pcb *pcb) -{ - lck_mtx_lock_spin(&pcb->ipsec_pcb_data_move_lock); - VERIFY(pcb->ipsec_pcb_data_move > 0); - /* - * if there's no more thread moving data, wakeup any - * drainers that's blocked waiting for this. - */ - if (--pcb->ipsec_pcb_data_move == 0 && pcb->ipsec_pcb_drainers > 0) { - wakeup(&(pcb->ipsec_pcb_data_move)); - } - lck_mtx_unlock(&pcb->ipsec_pcb_data_move_lock); -} static void ipsec_data_move_drain(struct ipsec_pcb *pcb) diff --git a/bsd/netinet/flow_divert.c b/bsd/netinet/flow_divert.c index 84f3f4f48..5a5ab0961 100644 --- a/bsd/netinet/flow_divert.c +++ b/bsd/netinet/flow_divert.c @@ -2304,7 +2304,7 @@ flow_divert_handle_app_map_create(struct flow_divert_group *group, mbuf_t packet struct flow_divert_trie new_trie; int insert_error = 0; size_t nodes_mem_size; - int prefix_count = 0; + int prefix_count = -1; int signing_id_count = 0; size_t trie_memory_size = 0; @@ -2320,9 +2320,10 @@ flow_divert_handle_app_map_create(struct flow_divert_group *group, mbuf_t packet memset(&new_trie, 0, sizeof(new_trie)); /* Get the number of shared prefixes in the new set of signing ID strings */ - flow_divert_packet_get_tlv(packet, offset, FLOW_DIVERT_TLV_PREFIX_COUNT, sizeof(prefix_count), &prefix_count, NULL); + error = flow_divert_packet_get_tlv(packet, 
offset, FLOW_DIVERT_TLV_PREFIX_COUNT, sizeof(prefix_count), &prefix_count, NULL); - if (prefix_count < 0) { + if (prefix_count < 0 || error) { + FDLOG(LOG_ERR, &nil_pcb, "Invalid prefix count (%d) or an error occurred while reading the prefix count: %d", prefix_count, error); lck_rw_done(&group->lck); return; } @@ -2332,7 +2333,12 @@ flow_divert_handle_app_map_create(struct flow_divert_group *group, mbuf_t packet cursor >= 0; cursor = flow_divert_packet_find_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, &error, 1)) { uint32_t sid_size = 0; - flow_divert_packet_get_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, 0, NULL, &sid_size); + error = flow_divert_packet_get_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, 0, NULL, &sid_size); + if (error || sid_size == 0) { + FDLOG(LOG_ERR, &nil_pcb, "Failed to get the length of the signing identifier at offset %d: %d", cursor, error); + signing_id_count = 0; + break; + } new_trie.bytes_count += sid_size; signing_id_count++; } @@ -2382,6 +2388,7 @@ flow_divert_handle_app_map_create(struct flow_divert_group *group, mbuf_t packet new_trie.bytes = (uint8_t *)(void *)((uint8_t *)new_trie.memory + nodes_mem_size + child_maps_mem_size); new_trie.bytes_free_next = 0; + memset(new_trie.bytes, 0, bytes_mem_size); /* The root is an empty node */ new_trie.root = trie_node_alloc(&new_trie); @@ -2391,10 +2398,20 @@ flow_divert_handle_app_map_create(struct flow_divert_group *group, mbuf_t packet cursor >= 0; cursor = flow_divert_packet_find_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, &error, 1)) { uint32_t sid_size = 0; - flow_divert_packet_get_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, 0, NULL, &sid_size); + error = flow_divert_packet_get_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, 0, NULL, &sid_size); + if (error || sid_size == 0) { + FDLOG(LOG_ERR, &nil_pcb, "Failed to get the length of the signing identifier at offset %d while building: %d", cursor, error); + insert_error = EINVAL; + break; + } if 
(new_trie.bytes_free_next + sid_size <= new_trie.bytes_count) { uint16_t new_node_idx; - flow_divert_packet_get_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, sid_size, &TRIE_BYTE(&new_trie, new_trie.bytes_free_next), NULL); + error = flow_divert_packet_get_tlv(packet, cursor, FLOW_DIVERT_TLV_SIGNING_ID, sid_size, &TRIE_BYTE(&new_trie, new_trie.bytes_free_next), NULL); + if (error) { + FDLOG(LOG_ERR, &nil_pcb, "Failed to read the signing identifier at offset %d: %d", cursor, error); + insert_error = EINVAL; + break; + } new_node_idx = flow_divert_trie_insert(&new_trie, new_trie.bytes_free_next, sid_size); if (new_node_idx == NULL_TRIE_IDX) { insert_error = EINVAL; diff --git a/bsd/netinet6/ip6_input.c b/bsd/netinet6/ip6_input.c index dad053c63..8d9241fc1 100644 --- a/bsd/netinet6/ip6_input.c +++ b/bsd/netinet6/ip6_input.c @@ -1956,11 +1956,9 @@ ip6_notify_pmtu(struct inpcb *in6p, struct sockaddr_in6 *dst, u_int32_t *mtu) } if (sbappendaddr(&so->so_rcv, SA(dst), NULL, m_mtu, NULL) == 0) { - m_freem(m_mtu); - /* XXX: should count statistics */ - } else { - sorwakeup(so); + return; } + sorwakeup(so); } /* diff --git a/bsd/nfs/nfs_node.c b/bsd/nfs/nfs_node.c index 8f7da7ea0..90400cfa5 100644 --- a/bsd/nfs/nfs_node.c +++ b/bsd/nfs/nfs_node.c @@ -561,7 +561,6 @@ nfs_nget( { error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &np->n_vnode); } -notsup: if (error) { FSDBG(266, 0, np, np->n_flag, 0xb1eb1e); nfs_node_unlock(np); @@ -913,7 +912,6 @@ nfs_vnop_reclaim( { vnode_t vp = ap->a_vp; nfsnode_t np = VTONFS(vp); - vfs_context_t ctx = ap->a_context; struct nfs_open_file *nofp, *nextnofp; struct nfs_file_lock *nflp, *nextnflp; struct nfs_lock_owner *nlop, *nextnlop; diff --git a/bsd/nfs/nfs_subs.c b/bsd/nfs/nfs_subs.c index 51d151ecf..9c061a432 100644 --- a/bsd/nfs/nfs_subs.c +++ b/bsd/nfs/nfs_subs.c @@ -892,7 +892,7 @@ nfsm_chain_add_v2sattr_f(struct nfsm_chain *nmc, struct vnode_attr *vap, uint32_ */ int nfsm_chain_add_v3sattr_f( - struct nfsmount *nmp, + __unused 
struct nfsmount *nmp, struct nfsm_chain *nmc, struct vnode_attr *vap) { @@ -1124,7 +1124,7 @@ get_auxiliary_groups(kauth_cred_t cred, gid_t groups[NGROUPS], int count) } int -nfsm_rpchead2(struct nfsmount *nmp, int sotype, int prog, int vers, int proc, int auth_type, +nfsm_rpchead2(__unused struct nfsmount *nmp, int sotype, int prog, int vers, int proc, int auth_type, kauth_cred_t cred, struct nfsreq *req, mbuf_t mrest, u_int64_t *xidp, mbuf_t *mreqp) { mbuf_t mreq, mb; @@ -1320,7 +1320,7 @@ nfsm_rpchead2(struct nfsmount *nmp, int sotype, int prog, int vers, int proc, in */ int nfs_parsefattr( - struct nfsmount *nmp, + __unused struct nfsmount *nmp, struct nfsm_chain *nmc, int nfsvers, struct nfs_vattr *nvap) diff --git a/bsd/nfs/nfs_vfsops.c b/bsd/nfs/nfs_vfsops.c index 67b409bae..902680c68 100644 --- a/bsd/nfs/nfs_vfsops.c +++ b/bsd/nfs/nfs_vfsops.c @@ -526,11 +526,10 @@ nfs4_update_statfs(struct nfsmount *nmp, vfs_context_t ctx) * Return an NFS volume name from the mntfrom name. */ static void -nfs_get_volname(struct mount *mp, char *volname, size_t len, vfs_context_t ctx) +nfs_get_volname(struct mount *mp, char *volname, size_t len, __unused vfs_context_t ctx) { const char *ptr, *cptr; const char *mntfrom = mp->mnt_vfsstat.f_mntfromname; - struct nfsmount *nmp = VFSTONFS(mp); size_t mflen; diff --git a/bsd/nfs/nfs_vnops.c b/bsd/nfs/nfs_vnops.c index abb24e2fb..1df01abc0 100644 --- a/bsd/nfs/nfs_vnops.c +++ b/bsd/nfs/nfs_vnops.c @@ -1238,7 +1238,7 @@ nfs_close( struct nfs_open_file *nofp, uint32_t accessMode, uint32_t denyMode, - vfs_context_t ctx) + __unused vfs_context_t ctx) { #if CONFIG_NFS4 struct nfs_lock_owner *nlop; @@ -1827,20 +1827,6 @@ nfs_getattr_internal(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, in return error; } -static int -nfs_parse_user_access( - mount_t mp, - enum vtype type) -{ - int user_access = R_OK; - if ((vfs_flags(mp) & MNT_RDONLY) == 0) { - user_access |= W_OK; - } - if (type == VDIR) { - user_access |= X_OK; - } - 
return user_access; -} /* * NFS getattr call from vfs. diff --git a/bsd/pthread/pthread_workqueue.c b/bsd/pthread/pthread_workqueue.c index 5d6361cca..10e17d52e 100644 --- a/bsd/pthread/pthread_workqueue.c +++ b/bsd/pthread/pthread_workqueue.c @@ -1878,6 +1878,7 @@ bsdthread_set_self(proc_t p, thread_t th, pthread_priority_t priority, return fixedpri_rv; } + return 0; } diff --git a/bsd/tests/bsd_tests.c b/bsd/tests/bsd_tests.c index 6b1d11c5c..7ce85fdd7 100644 --- a/bsd/tests/bsd_tests.c +++ b/bsd/tests/bsd_tests.c @@ -51,6 +51,9 @@ extern kern_return_t arm64_lock_test(void); #endif kern_return_t kalloc_test(void); kern_return_t ipi_test(void); +#if defined(KERNEL_INTEGRITY_CTRR) +extern kern_return_t ctrr_test(void); +#endif #if __ARM_PAN_AVAILABLE__ extern kern_return_t arm64_late_pan_test(void); #endif @@ -63,6 +66,9 @@ struct xnupost_test bsd_post_tests[] = { #ifdef __arm64__ XNUPOST_TEST_CONFIG_BASIC(arm64_lock_test), #endif +#if defined(KERNEL_INTEGRITY_CTRR) + XNUPOST_TEST_CONFIG_BASIC(ctrr_test), +#endif #if __ARM_PAN_AVAILABLE__ XNUPOST_TEST_CONFIG_BASIC(arm64_late_pan_test), #endif diff --git a/bsd/tests/ctrr_test_sysctl.c b/bsd/tests/ctrr_test_sysctl.c index bea84e1ab..ffa15504f 100644 --- a/bsd/tests/ctrr_test_sysctl.c +++ b/bsd/tests/ctrr_test_sysctl.c @@ -28,3 +28,22 @@ #include +#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) +extern kern_return_t ctrr_test(void); + +static int +sysctl_run_ctrr_test(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + unsigned int dummy; + int error, changed; + error = sysctl_io_number(req, 0, sizeof(dummy), &dummy, &changed); + if (error || !changed) { + return error; + } + return ctrr_test(); +} + +SYSCTL_PROC(_kern, OID_AUTO, run_ctrr_test, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_run_ctrr_test, "I", ""); +#endif /* defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) */ diff --git a/bsd/vfs/vfs_syscalls.c 
b/bsd/vfs/vfs_syscalls.c index e1497887b..84627d65e 100644 --- a/bsd/vfs/vfs_syscalls.c +++ b/bsd/vfs/vfs_syscalls.c @@ -2800,6 +2800,7 @@ quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval) return error; } mp = nd.ni_vp->v_mount; + mount_ref(mp, 0); vnode_put(nd.ni_vp); nameidone(&nd); @@ -2874,6 +2875,7 @@ quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval) break; } /* switch */ + mount_drop(mp, 0); return error; } #else diff --git a/config/MasterVersion b/config/MasterVersion index b123f4ec5..3a06932c5 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -19.2.0 +19.3.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. diff --git a/iokit/Kernel/IOUserClient.cpp b/iokit/Kernel/IOUserClient.cpp index ae3296b4e..9b3cef8ce 100644 --- a/iokit/Kernel/IOUserClient.cpp +++ b/iokit/Kernel/IOUserClient.cpp @@ -2072,7 +2072,8 @@ IOUserClient::_sendAsyncResult64(OSAsyncReference64 reference, replyMsg.m.msg64.notifyHdr.size = sizeof(IOAsyncCompletionContent) + numArgs * sizeof(io_user_reference_t); replyMsg.m.msg64.notifyHdr.type = kIOAsyncCompletionNotificationType; - bcopy(reference, replyMsg.m.msg64.notifyHdr.reference, sizeof(OSAsyncReference64)); + /* Copy reference except for reference[0], which is left as 0 from the earlier bzero */ + bcopy(&reference[1], &replyMsg.m.msg64.notifyHdr.reference[1], sizeof(OSAsyncReference64) - sizeof(reference[0])); replyMsg.m.msg64.asyncContent.result = result; if (numArgs) { @@ -2089,7 +2090,8 @@ IOUserClient::_sendAsyncResult64(OSAsyncReference64 reference, + numArgs * sizeof(uint32_t); replyMsg.m.msg32.notifyHdr.type = kIOAsyncCompletionNotificationType; - for (idx = 0; idx < kOSAsyncRefCount; idx++) { + /* Skip reference[0] which is left as 0 from the earlier bzero */ + for (idx = 1; idx < kOSAsyncRefCount; idx++) { replyMsg.m.msg32.notifyHdr.reference[idx] = 
REF32(reference[idx]); } diff --git a/osfmk/arm/arm_init.c b/osfmk/arm/arm_init.c index 9f2b60169..965ba291d 100644 --- a/osfmk/arm/arm_init.c +++ b/osfmk/arm/arm_init.c @@ -101,7 +101,16 @@ int debug_task; boolean_t up_style_idle_exit = 0; +#if HAS_NEX_PG +uint32_t nex_pg = 1; +extern void set_nex_pg(void); +#endif +#if HAS_BP_RET +/* Enable both branch target retention (0x2) and branch direction retention (0x1) across sleep */ +uint32_t bp_ret = 3; +extern void set_bp_ret(void); +#endif #if INTERRUPT_MASKED_DEBUG boolean_t interrupt_masked_debug = 1; @@ -433,7 +442,15 @@ arm_init( PE_parse_boot_argn("interrupt_masked_debug_timeout", &interrupt_masked_timeout, sizeof(interrupt_masked_timeout)); #endif +#if HAS_NEX_PG + PE_parse_boot_argn("nexpg", &nex_pg, sizeof(nex_pg)); + set_nex_pg(); // Apply NEX powergating settings to boot CPU +#endif +#if HAS_BP_RET + PE_parse_boot_argn("bpret", &bp_ret, sizeof(bp_ret)); + set_bp_ret(); // Apply branch predictor retention settings to boot CPU +#endif PE_parse_boot_argn("immediate_NMI", &force_immediate_debug_halt, sizeof(force_immediate_debug_halt)); @@ -629,6 +646,14 @@ arm_init_cpu( mt_wake_per_core(); #endif /* MONOTONIC && defined(__arm64__) */ +#if defined(KERNEL_INTEGRITY_CTRR) + if (cpu_data_ptr->cluster_master) { + lck_spin_lock(&ctrr_cpu_start_lck); + ctrr_cluster_locked[cpu_data_ptr->cpu_cluster_id] = 1; + thread_wakeup(&ctrr_cluster_locked[cpu_data_ptr->cpu_cluster_id]); + lck_spin_unlock(&ctrr_cpu_start_lck); + } +#endif slave_main(NULL); } diff --git a/osfmk/arm/atomic.h b/osfmk/arm/atomic.h index a6b4c2b8c..0382aa233 100644 --- a/osfmk/arm/atomic.h +++ b/osfmk/arm/atomic.h @@ -74,6 +74,12 @@ #define OS_ATOMIC_USE_LLSC 0 #endif +#if defined(__ARM_ARCH_8_4__) && defined(__arm64__) +/* on armv8.4 16-byte aligned load/store pair is atomic */ +#undef os_atomic_load_is_plain +#define os_atomic_load_is_plain(p) \ + (sizeof(*(p)) <= 16 && _Alignof(typeof(*(p))) >= sizeof(*(p))) +#endif /* * On armv7 & arm64, we do 
provide fine grained dependency injection, so diff --git a/osfmk/arm/cpu.c b/osfmk/arm/cpu.c index 72e8c7800..e641e72d9 100644 --- a/osfmk/arm/cpu.c +++ b/osfmk/arm/cpu.c @@ -377,6 +377,7 @@ cpu_data_init(cpu_data_t *cpu_data_ptr) cpu_data_ptr->cpu_CLW_active = 0x1UL; #endif +#if !XNU_MONITOR pmap_cpu_data_t * pmap_cpu_data_ptr = &cpu_data_ptr->cpu_pmap_cpu_data; pmap_cpu_data_ptr->cpu_user_pmap = (struct pmap *) NULL; @@ -386,6 +387,7 @@ cpu_data_init(cpu_data_t *cpu_data_ptr) for (i = 0; i < (sizeof(pmap_cpu_data_ptr->cpu_asid_high_bits) / sizeof(*pmap_cpu_data_ptr->cpu_asid_high_bits)); i++) { pmap_cpu_data_ptr->cpu_asid_high_bits[i] = 0; } +#endif cpu_data_ptr->halt_status = CPU_NOT_HALTED; } @@ -421,7 +423,9 @@ cpu_start(int cpu) cpu_data_ptr = CpuDataEntries[cpu].cpu_data_vaddr; cpu_data_ptr->cpu_reset_handler = (vm_offset_t) start_cpu_paddr; +#if !XNU_MONITOR cpu_data_ptr->cpu_pmap_cpu_data.cpu_user_pmap = NULL; +#endif if (cpu_data_ptr->cpu_processor->startup_thread != THREAD_NULL) { first_thread = cpu_data_ptr->cpu_processor->startup_thread; diff --git a/osfmk/arm/cpu_common.c b/osfmk/arm/cpu_common.c index 327434ece..9d972f6e4 100644 --- a/osfmk/arm/cpu_common.c +++ b/osfmk/arm/cpu_common.c @@ -68,6 +68,9 @@ unsigned int real_ncpus = 1; boolean_t idle_enable = FALSE; uint64_t wake_abstime = 0x0ULL; +#if defined(HAS_IPI) +extern unsigned int gFastIPI; +#endif /* defined(HAS_IPI) */ cpu_data_t * cpu_datap(int cpu) @@ -419,9 +422,25 @@ cpu_signal_internal(cpu_data_t *target_proc, if (!(target_proc->cpu_signal & SIGPdisabled)) { if (defer) { +#if defined(HAS_IPI) + if (gFastIPI) { + ml_cpu_signal_deferred(target_proc->cpu_phys_id); + } else { + PE_cpu_signal_deferred(getCpuDatap()->cpu_id, target_proc->cpu_id); + } +#else PE_cpu_signal_deferred(getCpuDatap()->cpu_id, target_proc->cpu_id); +#endif /* defined(HAS_IPI) */ } else { +#if defined(HAS_IPI) + if (gFastIPI) { + ml_cpu_signal(target_proc->cpu_phys_id); + } else { + PE_cpu_signal(getCpuDatap()->cpu_id, 
target_proc->cpu_id); + } +#else PE_cpu_signal(getCpuDatap()->cpu_id, target_proc->cpu_id); +#endif /* defined(HAS_IPI) */ } } @@ -449,7 +468,15 @@ cpu_signal_cancel(cpu_data_t *target_proc) { /* TODO: Should we care about the state of a core as far as squashing deferred IPIs goes? */ if (!(target_proc->cpu_signal & SIGPdisabled)) { +#if defined(HAS_IPI) + if (gFastIPI) { + ml_cpu_signal_retract(target_proc->cpu_phys_id); + } else { + PE_cpu_signal_cancel(getCpuDatap()->cpu_id, target_proc->cpu_id); + } +#else PE_cpu_signal_cancel(getCpuDatap()->cpu_id, target_proc->cpu_id); +#endif /* defined(HAS_IPI) */ } } diff --git a/osfmk/arm/cpu_data_internal.h b/osfmk/arm/cpu_data_internal.h index 8b29c711a..98eac98c5 100644 --- a/osfmk/arm/cpu_data_internal.h +++ b/osfmk/arm/cpu_data_internal.h @@ -271,7 +271,9 @@ typedef struct cpu_data { uint32_t cpu_l3_id; uint32_t cpu_l3_size; +#if !XNU_MONITOR struct pmap_cpu_data cpu_pmap_cpu_data; +#endif dbgwrap_thread_state_t halt_state; enum { CPU_NOT_HALTED = 0, diff --git a/osfmk/arm/cpu_internal.h b/osfmk/arm/cpu_internal.h index 7a9892600..8e4a31454 100644 --- a/osfmk/arm/cpu_internal.h +++ b/osfmk/arm/cpu_internal.h @@ -74,5 +74,10 @@ extern unsigned int real_ncpus; extern void arm64_ipi_test(void); #endif /* defined(CONFIG_XNUPOST) && __arm64__ */ +#if defined(KERNEL_INTEGRITY_CTRR) +extern void init_ctrr_cpu_start_lock(void); +extern lck_spin_t ctrr_cpu_start_lck; +extern bool ctrr_cluster_locked[__ARM_CLUSTER_COUNT__]; +#endif /* defined(KERNEL_INTEGRITY_CTRR) */ #endif /* _ARM_CPU_INTERNAL_H_ */ diff --git a/osfmk/arm/cpuid.c b/osfmk/arm/cpuid.c index 73f9b0d83..f976aea35 100644 --- a/osfmk/arm/cpuid.c +++ b/osfmk/arm/cpuid.c @@ -185,6 +185,12 @@ cpuid_get_cpufamily(void) case CPU_PART_TEMPEST_ARUBA: cpufamily = CPUFAMILY_ARM_VORTEX_TEMPEST; break; +#ifndef RC_HIDE_XNU_LIGHTNING + case CPU_PART_LIGHTNING: + case CPU_PART_THUNDER: + cpufamily = CPUFAMILY_ARM_LIGHTNING_THUNDER; + break; +#endif /* !RC_HIDE_XNU_LIGHTNING */ 
default: cpufamily = CPUFAMILY_UNKNOWN; break; diff --git a/osfmk/arm/cpuid.h b/osfmk/arm/cpuid.h index 559cde9fc..74aac691d 100644 --- a/osfmk/arm/cpuid.h +++ b/osfmk/arm/cpuid.h @@ -154,6 +154,14 @@ typedef union { /* H11G e-Core (ARMv8 architecture) */ #define CPU_PART_TEMPEST_ARUBA 0x11 +#ifndef RC_HIDE_XNU_LIGHTNING +/* H12 p-Core (ARMv8 architecture) */ +#define CPU_PART_LIGHTNING 0x12 + +/* H12 e-Core (ARMv8 architecture) */ +#define CPU_PART_THUNDER 0x13 + +#endif /* !RC_HIDE_XNU_LIGHTNING */ /* Cache type identification */ diff --git a/osfmk/arm/data.s b/osfmk/arm/data.s index 917e68c2f..b7e66378a 100644 --- a/osfmk/arm/data.s +++ b/osfmk/arm/data.s @@ -106,14 +106,14 @@ LEXT(vfptrash_data) #if __arm64__ .section __DATA, __const -#if defined(KERNEL_INTEGRITY_KTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) /* reserve space for read only page tables */ .align 14 LEXT(ropagetable_begin) .space 14*16*1024,0 #else LEXT(ropagetable_begin) -#endif /* defined(KERNEL_INTEGRITY_KTRR)*/ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ LEXT(ropagetable_end) diff --git a/osfmk/arm/machine_routines.h b/osfmk/arm/machine_routines.h index db581e897..759802bdd 100644 --- a/osfmk/arm/machine_routines.h +++ b/osfmk/arm/machine_routines.h @@ -613,10 +613,10 @@ unsigned long monitor_call(uintptr_t callnum, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3); #endif /* MONITOR */ -#if defined(KERNEL_INTEGRITY_KTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) void rorgn_stash_range(void); void rorgn_lockdown(void); -#endif /* defined(KERNEL_INTEGRITY_KTRR)*/ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ #if __ARM_KERNEL_PROTECT__ extern void set_vbar_el1(uint64_t); diff --git a/osfmk/arm/machine_routines_common.c b/osfmk/arm/machine_routines_common.c index 02f733910..b433dd658 100644 --- a/osfmk/arm/machine_routines_common.c +++ b/osfmk/arm/machine_routines_common.c 
@@ -55,7 +55,12 @@ extern boolean_t interrupt_masked_debug; extern uint64_t interrupt_masked_timeout; #endif +#if !HAS_CONTINUOUS_HWCLOCK extern uint64_t mach_absolutetime_asleep; +#else +extern uint64_t wake_abstime; +static uint64_t wake_conttime = UINT64_MAX; +#endif static void sched_perfcontrol_oncore_default(perfcontrol_state_t new_thread_state __unused, going_on_core_t on __unused) @@ -686,26 +691,49 @@ ml_get_abstime_offset(void) uint64_t ml_get_conttime_offset(void) { +#if HAS_CONTINUOUS_HWCLOCK + return 0; +#else return rtclock_base_abstime + mach_absolutetime_asleep; +#endif } uint64_t ml_get_time_since_reset(void) { +#if HAS_CONTINUOUS_HWCLOCK + if (wake_conttime == UINT64_MAX) { + return UINT64_MAX; + } else { + return mach_continuous_time() - wake_conttime; + } +#else /* The timebase resets across S2R, so just return the raw value. */ return ml_get_hwclock(); +#endif } void ml_set_reset_time(__unused uint64_t wake_time) { +#if HAS_CONTINUOUS_HWCLOCK + wake_conttime = wake_time; +#endif } uint64_t ml_get_conttime_wake_time(void) { +#if HAS_CONTINUOUS_HWCLOCK + /* + * For now, we will reconstitute the timebase value from + * cpu_timebase_init and use it as the wake time. + */ + return wake_abstime - ml_get_abstime_offset(); +#else /* HAS_CONTINUOUS_HWCLOCK */ /* The wake time is simply our continuous time offset. */ return ml_get_conttime_offset(); +#endif /* HAS_CONTINUOUS_HWCLOCK */ } /* diff --git a/osfmk/arm/pmap.c b/osfmk/arm/pmap.c index f40eeb88c..003be491f 100644 --- a/osfmk/arm/pmap.c +++ b/osfmk/arm/pmap.c @@ -537,10 +537,135 @@ int pmap_stats_assert = 1; #endif /* DEVELOPMENT || DEBUG */ +#if XNU_MONITOR +/* + * PPL External References.
+ */ +extern vm_offset_t segPPLDATAB; +extern unsigned long segSizePPLDATA; +extern vm_offset_t segPPLTEXTB; +extern unsigned long segSizePPLTEXT; +#if __APRR_SUPPORTED__ +extern vm_offset_t segPPLTRAMPB; +extern unsigned long segSizePPLTRAMP; +extern void ppl_trampoline_start; +extern void ppl_trampoline_end; +#endif +extern vm_offset_t segPPLDATACONSTB; +extern unsigned long segSizePPLDATACONST; + + +/* + * PPL Global Variables + */ + +#if (DEVELOPMENT || DEBUG) +/* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */ +SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE; +#else +const boolean_t pmap_ppl_disable = FALSE; +#endif + +/* Indicates if the PPL has started applying APRR. */ +boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE; + +/* + * The PPL cannot invoke the kernel in order to allocate memory, so we must + * maintain a list of free pages that the PPL owns. The kernel can give the PPL + * additional pages. + */ +decl_simple_lock_data(, pmap_ppl_free_page_lock MARK_AS_PMAP_DATA); +void ** pmap_ppl_free_page_list MARK_AS_PMAP_DATA = NULL; +uint64_t pmap_ppl_free_page_count MARK_AS_PMAP_DATA = 0; +uint64_t pmap_ppl_pages_returned_to_kernel_count_total = 0; + +struct pmap_cpu_data_array_entry pmap_cpu_data_array[MAX_CPUS] MARK_AS_PMAP_DATA; + +#ifdef CPU_CLUSTER_OFFSETS +const uint64_t pmap_cluster_offsets[] = CPU_CLUSTER_OFFSETS; +_Static_assert((sizeof(pmap_cluster_offsets) / sizeof(pmap_cluster_offsets[0])) == __ARM_CLUSTER_COUNT__, + "pmap_cluster_offsets[] count does not match __ARM_CLUSTER_COUNT__"); +#endif + +extern void *pmap_stacks_start; +extern void *pmap_stacks_end; +SECURITY_READ_ONLY_LATE(pmap_paddr_t) pmap_stacks_start_pa = 0; +SECURITY_READ_ONLY_LATE(pmap_paddr_t) pmap_stacks_end_pa = 0; +SECURITY_READ_ONLY_LATE(pmap_paddr_t) ppl_cpu_save_area_start = 0; +SECURITY_READ_ONLY_LATE(pmap_paddr_t) ppl_cpu_save_area_end = 0; + +/* Allocation data/locks for pmap structures. 
*/ +decl_simple_lock_data(, pmap_free_list_lock MARK_AS_PMAP_DATA); +SECURITY_READ_ONLY_LATE(unsigned long) pmap_array_count = 0; +SECURITY_READ_ONLY_LATE(void *) pmap_array_begin = NULL; +SECURITY_READ_ONLY_LATE(void *) pmap_array_end = NULL; +SECURITY_READ_ONLY_LATE(pmap_t) pmap_array = NULL; +pmap_t pmap_free_list MARK_AS_PMAP_DATA = NULL; + +/* Allocation data/locks/structs for task ledger structures. */ +#define PMAP_LEDGER_DATA_BYTES \ + (((sizeof(task_ledgers) / sizeof(int)) * sizeof(struct ledger_entry)) + sizeof(struct ledger)) + +/* + * Maximum number of ledgers allowed are maximum number of tasks + * allowed on system plus some more i.e. ~10% of total tasks = 200. + */ +#define MAX_PMAP_LEDGERS (MAX_ASID + 200) + +typedef struct pmap_ledger_data { + char pld_data[PMAP_LEDGER_DATA_BYTES]; +} pmap_ledger_data_t; + +typedef struct pmap_ledger { + union { + struct pmap_ledger_data ple_data; + struct pmap_ledger * next; + }; + + struct pmap_ledger ** back_ptr; +} pmap_ledger_t; + +SECURITY_READ_ONLY_LATE(bool) pmap_ledger_alloc_initialized = false; +decl_simple_lock_data(, pmap_ledger_lock MARK_AS_PMAP_DATA); +SECURITY_READ_ONLY_LATE(void *) pmap_ledger_refcnt_begin = NULL; +SECURITY_READ_ONLY_LATE(void *) pmap_ledger_refcnt_end = NULL; +SECURITY_READ_ONLY_LATE(os_refcnt_t *) pmap_ledger_refcnt = NULL; +SECURITY_READ_ONLY_LATE(void *) pmap_ledger_ptr_array_begin = NULL; +SECURITY_READ_ONLY_LATE(void *) pmap_ledger_ptr_array_end = NULL; +SECURITY_READ_ONLY_LATE(pmap_ledger_t * *) pmap_ledger_ptr_array = NULL; +uint64_t pmap_ledger_ptr_array_free_index MARK_AS_PMAP_DATA = 0; +pmap_ledger_t * pmap_ledger_free_list MARK_AS_PMAP_DATA = NULL; + +#define pmap_ledger_debit(p, e, a) ledger_debit_nocheck((p)->ledger, e, a) +#define pmap_ledger_credit(p, e, a) ledger_credit_nocheck((p)->ledger, e, a) + +static inline void +pmap_check_ledger_fields(ledger_t ledger) +{ + if (ledger == NULL) { + return; + } + + thread_t cur_thread = current_thread(); + 
ledger_check_new_balance(cur_thread, ledger, task_ledgers.alternate_accounting); + ledger_check_new_balance(cur_thread, ledger, task_ledgers.alternate_accounting_compressed); + ledger_check_new_balance(cur_thread, ledger, task_ledgers.internal); + ledger_check_new_balance(cur_thread, ledger, task_ledgers.internal_compressed); + ledger_check_new_balance(cur_thread, ledger, task_ledgers.page_table); + ledger_check_new_balance(cur_thread, ledger, task_ledgers.phys_footprint); + ledger_check_new_balance(cur_thread, ledger, task_ledgers.phys_mem); + ledger_check_new_balance(cur_thread, ledger, task_ledgers.tkm_private); + ledger_check_new_balance(cur_thread, ledger, task_ledgers.wired_mem); +} + +#define pmap_ledger_check_balance(p) pmap_check_ledger_fields((p)->ledger) + +#else /* XNU_MONITOR */ #define pmap_ledger_debit(p, e, a) ledger_debit((p)->ledger, e, a) #define pmap_ledger_credit(p, e, a) ledger_credit((p)->ledger, e, a) +#endif /* !XNU_MONITOR */ #if DEVELOPMENT || DEBUG int panic_on_unsigned_execute = 0; @@ -799,6 +924,29 @@ typedef u_int16_t pp_attr_t; #define PP_ATTR_REFFAULT 0x1000 #define PP_ATTR_MODFAULT 0x2000 +#if XNU_MONITOR +/* + * Denotes that a page is owned by the PPL. This is modified/checked with the + * PVH lock held, to avoid ownership related races. This does not need to be a + * PP_ATTR bit (as we have the lock), but for now this is a convenient place to + * put the bit. + */ +#define PP_ATTR_MONITOR 0x4000 + +/* + * Denotes that a page *cannot* be owned by the PPL. This is required in order + * to temporarily 'pin' kernel pages that are used to store PPL output parameters. + * Otherwise a malicious or buggy caller could pass PPL-owned memory for these + * parameters and in so doing stage a write gadget against the PPL. + */ +#define PP_ATTR_NO_MONITOR 0x8000 + +/* + * All of the bits owned by the PPL; kernel requests to set or clear these bits + * are illegal. 
+ */ +#define PP_ATTR_PPL_OWNED_BITS (PP_ATTR_MONITOR | PP_ATTR_NO_MONITOR) +#endif SECURITY_READ_ONLY_LATE(pp_attr_t*) pp_attr_table; @@ -840,6 +988,14 @@ static bitmap_t asid_bitmap[BITMAP_LEN(MAX_ASID)] MARK_AS_PMAP_DATA; SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap; #endif +#if XNU_MONITOR +/* + * We define our target as 8 pages; enough for 2 page table pages, a PTD page, + * and a PV page; in essence, twice as many pages as may be necessary to satisfy + * a single pmap_enter request. + */ +#define PMAP_MIN_FREE_PPL_PAGES 8 +#endif #define pa_index(pa) \ (atop((pa) - vm_first_phys)) @@ -1105,6 +1261,25 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap; #define pa_clear_reference(x) \ pa_clear_bits(x, PP_ATTR_REFERENCED) +#if XNU_MONITOR +#define pa_set_monitor(x) \ + pa_set_bits((x), PP_ATTR_MONITOR) + +#define pa_clear_monitor(x) \ + pa_clear_bits((x), PP_ATTR_MONITOR) + +#define pa_test_monitor(x) \ + pa_test_bits((x), PP_ATTR_MONITOR) + +#define pa_set_no_monitor(x) \ + pa_set_bits((x), PP_ATTR_NO_MONITOR) + +#define pa_clear_no_monitor(x) \ + pa_clear_bits((x), PP_ATTR_NO_MONITOR) + +#define pa_test_no_monitor(x) \ + pa_test_bits((x), PP_ATTR_NO_MONITOR) +#endif #define IS_INTERNAL_PAGE(pai) \ ppattr_test_bits(&pp_attr_table[pai], PP_ATTR_INTERNAL) @@ -1292,11 +1467,61 @@ lck_grp_t pmap_lck_grp; #define current_pmap() \ (vm_map_pmap(current_thread()->map)) +#if XNU_MONITOR +/* + * PPL-related macros. 
+ */ +#define ARRAY_ELEM_PTR_IS_VALID(_ptr_, _elem_size_, _array_begin_, _array_end_) \ + (((_ptr_) >= (typeof(_ptr_))_array_begin_) && \ + ((_ptr_) < (typeof(_ptr_))_array_end_) && \ + !((((void *)(_ptr_)) - ((void *)_array_begin_)) % (_elem_size_))) + +#define PMAP_PTR_IS_VALID(x) ARRAY_ELEM_PTR_IS_VALID(x, sizeof(struct pmap), pmap_array_begin, pmap_array_end) + +#define USER_PMAP_IS_VALID(x) (PMAP_PTR_IS_VALID(x) && (os_atomic_load(&(x)->ref_count, relaxed) > 0)) + +#define VALIDATE_USER_PMAP(x) \ + if (__improbable(!USER_PMAP_IS_VALID(x))) \ + panic("%s: invalid pmap %p", __func__, (x)); + +#define VALIDATE_PMAP(x) \ + if (__improbable(((x) != kernel_pmap) && !USER_PMAP_IS_VALID(x))) \ + panic("%s: invalid pmap %p", __func__, (x)); + +#define VALIDATE_LEDGER_PTR(x) \ + if (__improbable(!ARRAY_ELEM_PTR_IS_VALID(x, sizeof(void *), pmap_ledger_ptr_array_begin, pmap_ledger_ptr_array_end))) \ + panic("%s: invalid ledger ptr %p", __func__, (x)); + +#define ARRAY_ELEM_INDEX(x, _elem_size_, _array_begin_) ((uint64_t)((((void *)(x)) - (_array_begin_)) / (_elem_size_))) + +static uint64_t +pmap_ledger_validate(void * ledger) +{ + uint64_t array_index; + pmap_ledger_t ** ledger_ptr_array_ptr = ((pmap_ledger_t*)ledger)->back_ptr; + VALIDATE_LEDGER_PTR(ledger_ptr_array_ptr); + array_index = ARRAY_ELEM_INDEX(ledger_ptr_array_ptr, sizeof(pmap_ledger_t *), pmap_ledger_ptr_array_begin); + + if (array_index >= MAX_PMAP_LEDGERS) { + panic("%s: ledger %p array index invalid, index was %#llx", __func__, ledger, array_index); + } + + pmap_ledger_t *ledger_ptr = *ledger_ptr_array_ptr; + + if (__improbable(ledger_ptr != ledger)) { + panic("%s: ledger pointer mismatch, %p != %p", __func__, ledger, ledger_ptr); + } + + return array_index; +} + +#else /* XNU_MONITOR */ #define VALIDATE_USER_PMAP(x) #define VALIDATE_PMAP(x) #define VALIDATE_LEDGER(x) +#endif #if DEVELOPMENT || DEBUG @@ -1469,9 +1694,43 @@ static void pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes); static void 
pmap_trim_self(pmap_t pmap); static void pmap_trim_subord(pmap_t subord); +#if __APRR_SUPPORTED__ +static uint64_t pte_to_xprr_perm(pt_entry_t pte); +static pt_entry_t xprr_perm_to_pte(uint64_t perm); +#endif /* __APRR_SUPPORTED__*/ + +#if XNU_MONITOR +static pmap_paddr_t pmap_alloc_page_for_kern(void); +static void pmap_alloc_page_for_ppl(void); + + +/* + * This macro generates prototypes for the *_internal functions, which + * represent the PPL interface. When the PPL is enabled, this will also + * generate prototypes for the PPL entrypoints (*_ppl), as well as generating + * the entrypoints. + */ +#define GEN_ASM_NAME(__function_name) _##__function_name##_ppl + +#define PMAP_SUPPORT_PROTOTYPES_WITH_ASM_INTERNAL(__return_type, __function_name, __function_args, __function_index, __assembly_function_name) \ + static __return_type __function_name##_internal __function_args; \ + extern __return_type __function_name##_ppl __function_args; \ + __asm__ (".text \n" \ + ".align 2 \n" \ + ".globl " #__assembly_function_name "\n" \ + #__assembly_function_name ":\n" \ + "mov x15, " #__function_index "\n" \ + "b _aprr_ppl_enter\n") + +#define PMAP_SUPPORT_PROTOTYPES_WITH_ASM(__return_type, __function_name, __function_args, __function_index, __assembly_function_name) \ + PMAP_SUPPORT_PROTOTYPES_WITH_ASM_INTERNAL(__return_type, __function_name, __function_args, __function_index, __assembly_function_name) +#define PMAP_SUPPORT_PROTOTYPES(__return_type, __function_name, __function_args, __function_index) \ + PMAP_SUPPORT_PROTOTYPES_WITH_ASM(__return_type, __function_name, __function_args, __function_index, GEN_ASM_NAME(__function_name)) +#else /* XNU_MONITOR */ #define PMAP_SUPPORT_PROTOTYPES(__return_type, __function_name, __function_args, __function_index) \ static __return_type __function_name##_internal __function_args +#endif /* XNU_MONITOR */ PMAP_SUPPORT_PROTOTYPES( kern_return_t, @@ -1628,7 +1887,7 @@ PMAP_SUPPORT_PROTOTYPES( void, pmap_set_nested, (pmap_t pmap), 
PMAP_SET_NESTED_INDEX); -#if MACH_ASSERT +#if MACH_ASSERT || XNU_MONITOR PMAP_SUPPORT_PROTOTYPES( void, pmap_set_process, (pmap_t pmap, @@ -1647,12 +1906,22 @@ PMAP_SUPPORT_PROTOTYPES( uint64_t size, unsigned int option), PMAP_UNNEST_OPTIONS_INDEX); +#if XNU_MONITOR +PMAP_SUPPORT_PROTOTYPES( + void, + pmap_cpu_data_init, (unsigned int cpu_number), PMAP_CPU_DATA_INIT_INDEX); +#endif PMAP_SUPPORT_PROTOTYPES( void, phys_attribute_set, (ppnum_t pn, unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX); +#if XNU_MONITOR +PMAP_SUPPORT_PROTOTYPES( + void, + pmap_mark_page_as_ppl_page, (pmap_paddr_t pa), PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX); +#endif PMAP_SUPPORT_PROTOTYPES( void, @@ -1673,6 +1942,11 @@ PMAP_SUPPORT_PROTOTYPES( void, pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX); +#if XNU_MONITOR +PMAP_SUPPORT_PROTOTYPES( + uint64_t, + pmap_release_ppl_pages_to_kernel, (void), PMAP_RELEASE_PAGES_TO_KERNEL_INDEX); +#endif PMAP_SUPPORT_PROTOTYPES( void, @@ -1686,10 +1960,21 @@ PMAP_SUPPORT_PROTOTYPES( addr64_t nstart, uint64_t size), PMAP_TRIM_INDEX); +#if HAS_APPLE_PAC && XNU_MONITOR +PMAP_SUPPORT_PROTOTYPES( + void *, + pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator), PMAP_SIGN_USER_PTR); +PMAP_SUPPORT_PROTOTYPES( + void *, + pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator), PMAP_AUTH_USER_PTR); +#endif /* HAS_APPLE_PAC && XNU_MONITOR */ +#if XNU_MONITOR +static void pmap_mark_page_as_ppl_page(pmap_paddr_t pa); +#endif void pmap_footprint_suspend(vm_map_t map, boolean_t suspend); @@ -1699,6 +1984,22 @@ PMAP_SUPPORT_PROTOTYPES( boolean_t suspend), PMAP_FOOTPRINT_SUSPEND_INDEX); +#if XNU_MONITOR +PMAP_SUPPORT_PROTOTYPES( + void, + pmap_ledger_alloc_init, (size_t), + PMAP_LEDGER_ALLOC_INIT_INDEX); + +PMAP_SUPPORT_PROTOTYPES( + ledger_t, + pmap_ledger_alloc, (void), + PMAP_LEDGER_ALLOC_INDEX); + +PMAP_SUPPORT_PROTOTYPES( + void, + pmap_ledger_free, (ledger_t), + PMAP_LEDGER_FREE_INDEX); +#endif #if CONFIG_PGTRACE boolean_t 
pgtrace_enabled = 0; @@ -1756,6 +2057,91 @@ long long alloc_pmap_pages_count __attribute__((aligned(8))) = 0LL; int pt_fake_zone_index = -1; /* index of pmap fake zone */ +#if XNU_MONITOR +/* + * Table of function pointers used for PPL dispatch. + */ +const void * const ppl_handler_table[PMAP_COUNT] = { + [ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal, + [ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal, + [MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal, + [MAPPING_REPLENISH_INDEX] = mapping_replenish_internal, + [PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal, + [PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal, + [PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal, + [PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal, + [PMAP_CREATE_INDEX] = pmap_create_options_internal, + [PMAP_DESTROY_INDEX] = pmap_destroy_internal, + [PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal, + [PMAP_EXTRACT_INDEX] = pmap_extract_internal, + [PMAP_FIND_PHYS_INDEX] = pmap_find_phys_internal, + [PMAP_INSERT_SHAREDPAGE_INDEX] = pmap_insert_sharedpage_internal, + [PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal, + [PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal, + [PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal, + [PMAP_NEST_INDEX] = pmap_nest_internal, + [PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal, + [PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal, + [PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal, + [PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal, + [PMAP_REFERENCE_INDEX] = pmap_reference_internal, + [PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal, + [PMAP_RETURN_INDEX] = pmap_return_internal, + [PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal, + [PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal, + [PMAP_SET_NESTED_INDEX] = 
pmap_set_nested_internal, + [PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal, + [PMAP_SWITCH_INDEX] = pmap_switch_internal, + [PMAP_SWITCH_USER_TTB_INDEX] = pmap_switch_user_ttb_internal, + [PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal, + [PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal, + [PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal, + [PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal, + [PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal, + [PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal, + [PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal, + [PMAP_TRIM_INDEX] = pmap_trim_internal, + [PMAP_LEDGER_ALLOC_INIT_INDEX] = pmap_ledger_alloc_init_internal, + [PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal, + [PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal, +#if HAS_APPLE_PAC && XNU_MONITOR + [PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal, + [PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal, +#endif /* HAS_APPLE_PAC && XNU_MONITOR */ +}; + +static uint64_t +pmap_get_ppl_cpu_id(void) +{ + uint64_t mpidr_el1_value = 0; + + /* We identify the CPU based on the constant bits of MPIDR_EL1. */ + MRS(mpidr_el1_value, "MPIDR_EL1"); + +#ifdef CPU_CLUSTER_OFFSETS + uint64_t cluster_id = (mpidr_el1_value & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT; + assert(cluster_id < (sizeof(pmap_cluster_offsets) / sizeof(pmap_cluster_offsets[0]))); + + /* For multi-cluster configurations, AFF0 reflects the core number within the cluster. */ + mpidr_el1_value = (mpidr_el1_value & MPIDR_AFF0_MASK) + pmap_cluster_offsets[cluster_id]; +#else + /* + * AFF2 is not constant (it can change for e-core versus p-core on H9), + * so mask it out. 
+ */ + mpidr_el1_value &= MPIDR_AFF0_MASK; +#endif + + if (mpidr_el1_value > MAX_CPUS) { + panic("%s: mpidr_el1_value=%#llx > MAX_CPUS=%#x", + __FUNCTION__, mpidr_el1_value, MAX_CPUS); + } + + return mpidr_el1_value; +} + + +#endif /* @@ -1766,18 +2152,80 @@ pmap_cpu_data_init_internal(unsigned int cpu_number) { pmap_cpu_data_t * pmap_cpu_data = pmap_get_cpu_data(); +#if XNU_MONITOR + /* Verify cacheline-aligned */ + assert(((vm_offset_t)pmap_cpu_data & ((1 << L2_CLINE) - 1)) == 0); + if (pmap_cpu_data->cpu_number != PMAP_INVALID_CPU_NUM) { + panic("%s: pmap_cpu_data->cpu_number=%u, " + "cpu_number=%u", + __FUNCTION__, pmap_cpu_data->cpu_number, + cpu_number); + } +#endif pmap_cpu_data->cpu_number = cpu_number; } void pmap_cpu_data_init(void) { +#if XNU_MONITOR + pmap_cpu_data_init_ppl(cpu_number()); +#else pmap_cpu_data_init_internal(cpu_number()); +#endif } static void pmap_cpu_data_array_init(void) { +#if XNU_MONITOR + unsigned int i = 0; + pmap_paddr_t ppl_cpu_save_area_cur = 0; + pt_entry_t template, *pte_p; + vm_offset_t stack_va = (vm_offset_t)pmap_stacks_start + ARM_PGBYTES; + assert((pmap_stacks_start != NULL) && (pmap_stacks_end != NULL)); + pmap_stacks_start_pa = avail_start; + + for (i = 0; i < MAX_CPUS; i++) { + for (vm_offset_t cur_va = stack_va; cur_va < (stack_va + PPL_STACK_SIZE); cur_va += ARM_PGBYTES) { + assert(cur_va < (vm_offset_t)pmap_stacks_end); + pte_p = pmap_pte(kernel_pmap, cur_va); + assert(*pte_p == ARM_PTE_EMPTY); + template = pa_to_pte(avail_start) | ARM_PTE_AF | ARM_PTE_SH(SH_OUTER_MEMORY) | ARM_PTE_TYPE | + ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT) | xprr_perm_to_pte(XPRR_PPL_RW_PERM); +#if __ARM_KERNEL_PROTECT__ + template |= ARM_PTE_NG; +#endif /* __ARM_KERNEL_PROTECT__ */ + WRITE_PTE(pte_p, template); + __builtin_arm_isb(ISB_SY); + avail_start += ARM_PGBYTES; + } +#if KASAN + kasan_map_shadow(stack_va, PPL_STACK_SIZE, false); +#endif + pmap_cpu_data_array[i].cpu_data.cpu_id = i; + pmap_cpu_data_array[i].cpu_data.cpu_number = 
PMAP_INVALID_CPU_NUM; + pmap_cpu_data_array[i].cpu_data.ppl_state = PPL_STATE_KERNEL; + pmap_cpu_data_array[i].cpu_data.ppl_stack = (void*)(stack_va + PPL_STACK_SIZE); + stack_va += (PPL_STACK_SIZE + ARM_PGBYTES); + } + sync_tlb_flush(); + pmap_stacks_end_pa = avail_start; + + ppl_cpu_save_area_start = avail_start; + ppl_cpu_save_area_end = ppl_cpu_save_area_start; + ppl_cpu_save_area_cur = ppl_cpu_save_area_start; + + for (i = 0; i < MAX_CPUS; i++) { + while ((ppl_cpu_save_area_end - ppl_cpu_save_area_cur) < sizeof(arm_context_t)) { + avail_start += PAGE_SIZE; + ppl_cpu_save_area_end = avail_start; + } + + pmap_cpu_data_array[i].cpu_data.save_area = (arm_context_t *)phystokv(ppl_cpu_save_area_cur); + ppl_cpu_save_area_cur += sizeof(arm_context_t); + } +#endif pmap_cpu_data_init(); } @@ -1787,11 +2235,208 @@ pmap_get_cpu_data(void) { pmap_cpu_data_t * pmap_cpu_data = NULL; +#if XNU_MONITOR + uint64_t cpu_id = 0; + + cpu_id = pmap_get_ppl_cpu_id(); + pmap_cpu_data = &pmap_cpu_data_array[cpu_id].cpu_data; + + if (pmap_cpu_data->cpu_id != cpu_id) { + panic("%s: CPU ID mismatch, cpu_id=0x%#llx, pmap_cpu_data->cpu_id=%#llx", + __FUNCTION__, cpu_id, pmap_cpu_data->cpu_id); + } +#else pmap_cpu_data = &getCpuDatap()->cpu_pmap_cpu_data; +#endif return pmap_cpu_data; } +#if XNU_MONITOR +/* + * pmap_set_range_xprr_perm takes a range (specified using start and end) that + * falls within the physical aperture. All mappings within this range have + * their protections changed from those specified by the expected_perm to those + * specified by the new_perm. 
+ */ +static void +pmap_set_range_xprr_perm(vm_address_t start, + vm_address_t end, + unsigned int expected_perm, + unsigned int new_perm) +{ +#if (__ARM_VMSA__ == 7) +#error This function is not supported on older ARM hardware +#else + pmap_t pmap = NULL; + + vm_address_t va = 0; + vm_address_t tte_start = 0; + vm_address_t tte_end = 0; + + tt_entry_t *tte_p = NULL; + pt_entry_t *pte_p = NULL; + pt_entry_t *cpte_p = NULL; + pt_entry_t *bpte_p = NULL; + pt_entry_t *epte_p = NULL; + + tt_entry_t tte = 0; + pt_entry_t cpte = 0; + pt_entry_t template = 0; + + pmap = kernel_pmap; + + va = start; + + /* + * Validate our arguments; any invalid argument will be grounds for a + * panic. + */ + if ((start | end) % ARM_PGBYTES) { + panic("%s: start or end not page aligned, " + "start=%p, end=%p, new_perm=%u, expected_perm=%u", + __FUNCTION__, + (void *)start, (void *)end, new_perm, expected_perm); + } + + if (start > end) { + panic("%s: start > end, " + "start=%p, end=%p, new_perm=%u, expected_perm=%u", + __FUNCTION__, + (void *)start, (void *)end, new_perm, expected_perm); + } + + if (start < gVirtBase) { + panic("%s: start is before physical aperture, " + "start=%p, end=%p, new_perm=%u, expected_perm=%u", + __FUNCTION__, + (void *)start, (void *)end, new_perm, expected_perm); + } + + if (end > static_memory_end) { + panic("%s: end is after physical aperture, " + "start=%p, end=%p, new_perm=%u, expected_perm=%u", + __FUNCTION__, + (void *)start, (void *)end, new_perm, expected_perm); + } + + if ((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM)) { + panic("%s: invalid XPRR index, " + "start=%p, end=%p, new_perm=%u, expected_perm=%u", + __FUNCTION__, + (void *)start, (void *)end, new_perm, expected_perm); + } + + /* + * Walk over the PTEs for the given range, and set the protections on + * those PTEs. 
+ */ + while (va < end) { + tte_start = va; + tte_end = ((va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr)); + + if (tte_end > end) { + tte_end = end; + } + + tte_p = pmap_tte(pmap, va); + + /* + * The physical aperture should not have holes. + * The physical aperture should be contiguous. + * Do not make eye contact with the physical aperture. + */ + if (tte_p == NULL) { + panic("%s: physical aperture tte is NULL, " + "start=%p, end=%p, new_perm=%u, expected_perm=%u", + __FUNCTION__, + (void *)start, (void *)end, new_perm, expected_perm); + } + + tte = *tte_p; + + if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { + /* + * Walk over the given L3 page table page and update the + * PTEs. + */ + pte_p = (pt_entry_t *)ttetokv(tte); + bpte_p = &pte_p[ptenum(va)]; + epte_p = bpte_p + ((tte_end - va) >> pt_attr_leaf_shift(native_pt_attr)); + + for (cpte_p = bpte_p; cpte_p < epte_p; + cpte_p += PAGE_SIZE / ARM_PGBYTES, va += PAGE_SIZE) { + int pai = (int)pa_index(pte_to_pa(*cpte_p)); + LOCK_PVH(pai); + cpte = *cpte_p; + + /* + * Every PTE involved should be valid, should + * not have the hint bit set, and should have + * the expected APRR index. A PTE that fails + * any of these checks is grounds for a + * panic.
+ */ + if ((cpte & ARM_PTE_TYPE_MASK) == + ARM_PTE_TYPE_FAULT) { + panic("%s: physical aperture PTE is invalid, va=%p, " + "start=%p, end=%p, new_perm=%u, expected_perm=%u", + __FUNCTION__, + (void *)va, + (void *)start, (void *)end, new_perm, expected_perm); + UNLOCK_PVH(pai); + continue; + } + + if (cpte & ARM_PTE_HINT_MASK) { + panic("%s: physical aperture PTE has hint bit set, va=%p, cpte=0x%llx, " + "start=%p, end=%p, new_perm=%u, expected_perm=%u", + __FUNCTION__, + (void *)va, cpte, + (void *)start, (void *)end, new_perm, expected_perm); + } + + if (pte_to_xprr_perm(cpte) != expected_perm) { + panic("%s: perm=%llu does not match expected_perm, cpte=0x%llx, " + "start=%p, end=%p, new_perm=%u, expected_perm=%u", + __FUNCTION__, + pte_to_xprr_perm(cpte), cpte, + (void *)start, (void *)end, new_perm, expected_perm); + } + + template = cpte; + template &= ~ARM_PTE_XPRR_MASK; + template |= xprr_perm_to_pte(new_perm); + + WRITE_PTE_STRONG(cpte_p, template); + UNLOCK_PVH(pai); + } + } else { + panic("%s: tte=0x%llx is not a table type entry, " + "start=%p, end=%p, new_perm=%u, expected_perm=%u", + __FUNCTION__, + tte, + (void *)start, (void *)end, new_perm, expected_perm); + } + + va = tte_end; + } + + PMAP_UPDATE_TLBS(pmap, start, end, false); +#endif /* (__ARM_VMSA__ == 7) */ +} + +/* + * A convenience function for setting protections on a single page. + */ +static inline void +pmap_set_xprr_perm(vm_address_t page_kva, + unsigned int expected_perm, + unsigned int new_perm) +{ + pmap_set_range_xprr_perm(page_kva, page_kva + PAGE_SIZE, expected_perm, new_perm); +} +#endif /* XNU_MONITOR */ /* TODO */ @@ -1934,6 +2579,203 @@ pmap_pages_reclaim( } } +#if XNU_MONITOR +/* + * Return a PPL page to the free list. 
+ */ +static void +pmap_give_free_ppl_page(pmap_paddr_t paddr) +{ + assert((paddr & ARM_PGMASK) == 0); + void ** new_head = (void **)phystokv(paddr); + pmap_simple_lock(&pmap_ppl_free_page_lock); + + void * cur_head = pmap_ppl_free_page_list; + *new_head = cur_head; + pmap_ppl_free_page_list = new_head; + pmap_ppl_free_page_count++; + + pmap_simple_unlock(&pmap_ppl_free_page_lock); +} + +/* + * Get a PPL page from the free list. + */ +static pmap_paddr_t +pmap_get_free_ppl_page(void) +{ + pmap_paddr_t result = 0; + + pmap_simple_lock(&pmap_ppl_free_page_lock); + + if (pmap_ppl_free_page_list != NULL) { + void ** new_head = NULL; + new_head = *((void**)pmap_ppl_free_page_list); + result = kvtophys((vm_offset_t)pmap_ppl_free_page_list); + pmap_ppl_free_page_list = new_head; + pmap_ppl_free_page_count--; + } else { + result = 0L; + } + + pmap_simple_unlock(&pmap_ppl_free_page_lock); + assert((result & ARM_PGMASK) == 0); + + return result; +} + +/* + * pmap_mark_page_as_ppl_page claims a page on behalf of the PPL by marking it + * as PPL-owned and only allowing the PPL to write to it. + */ +MARK_AS_PMAP_TEXT static void +pmap_mark_page_as_ppl_page_internal(pmap_paddr_t pa) +{ + vm_offset_t kva = 0; + unsigned int pai = 0; + pp_attr_t attr; + + /* + * Mark each page that we allocate as belonging to the monitor, as we + * intend to use it for monitor-y stuff (page tables, table pages, that + * sort of thing). + */ + assert(!TEST_PAGE_RATIO_4); + + if (!pa_valid(pa)) { + panic("%s: bad address, " + "pa=%p", + __func__, + (void *)pa); + } + + pai = (unsigned int)pa_index(pa); + LOCK_PVH(pai); + + /* A page that the PPL already owns can't be given to the PPL. */ + if (pa_test_monitor(pa)) { + panic("%s: page already belongs to PPL, " + "pa=0x%llx", + __FUNCTION__, + pa); + } + /* The page cannot be mapped outside of the physical aperture. 
*/ + if (!pmap_verify_free((ppnum_t)atop(pa))) { + panic("%s: page is not free, " + "pa=0x%llx", + __FUNCTION__, + pa); + } + + do { + attr = pp_attr_table[pai]; + if (attr & PP_ATTR_NO_MONITOR) { + panic("%s: page excluded from PPL, " + "pa=0x%llx", + __FUNCTION__, + pa); + } + } while (!OSCompareAndSwap16(attr, attr | PP_ATTR_MONITOR, &pp_attr_table[pai])); + + UNLOCK_PVH(pai); + + kva = phystokv(pa); + pmap_set_xprr_perm(kva, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM); + bzero((void *)(kva & ~PAGE_MASK), PAGE_SIZE); + + pmap_give_free_ppl_page(pa); +} + +static void +pmap_mark_page_as_ppl_page(pmap_paddr_t pa) +{ + pmap_mark_page_as_ppl_page_ppl(pa); +} + +static void +pmap_mark_page_as_kernel_page(pmap_paddr_t pa) +{ + vm_offset_t kva = 0; + unsigned int pai = 0; + + pai = (unsigned int)pa_index(pa); + LOCK_PVH(pai); + + if (!pa_test_monitor(pa)) { + panic("%s: page is not a PPL page, " + "pa=%p", + __FUNCTION__, + (void *)pa); + } + + pa_clear_monitor(pa); + UNLOCK_PVH(pai); + + kva = phystokv(pa); + pmap_set_xprr_perm(kva, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM); +} + +MARK_AS_PMAP_TEXT static pmap_paddr_t +pmap_release_ppl_pages_to_kernel_internal(void) +{ + pmap_paddr_t pa = 0; + + if (pmap_ppl_free_page_count <= PMAP_MIN_FREE_PPL_PAGES) { + goto done; + } + + pa = pmap_get_free_ppl_page(); + + if (!pa) { + goto done; + } + + pmap_mark_page_as_kernel_page(pa); + +done: + return pa; +} + +static uint64_t +pmap_release_ppl_pages_to_kernel(void) +{ + pmap_paddr_t pa = 0; + vm_page_t m = VM_PAGE_NULL; + vm_page_t local_freeq = VM_PAGE_NULL; + uint64_t pmap_ppl_pages_returned_to_kernel_count = 0; + + while (pmap_ppl_free_page_count > PMAP_MIN_FREE_PPL_PAGES) { + pa = pmap_release_ppl_pages_to_kernel_ppl(); + + if (!pa) { + break; + } + + /* If we retrieved a page, add it to the free queue. 
*/ + vm_object_lock(pmap_object); + m = vm_page_lookup(pmap_object, (pa - gPhysBase)); + assert(m != VM_PAGE_NULL); + assert(VM_PAGE_WIRED(m)); + + m->vmp_busy = TRUE; + m->vmp_snext = local_freeq; + local_freeq = m; + pmap_ppl_pages_returned_to_kernel_count++; + pmap_ppl_pages_returned_to_kernel_count_total++; + + vm_object_unlock(pmap_object); + } + + if (local_freeq) { + /* We need to hold the object lock for freeing pages. */ + vm_object_lock(pmap_object); + vm_page_free_list(local_freeq, TRUE); + vm_object_unlock(pmap_object); + } + + return pmap_ppl_pages_returned_to_kernel_count; +} +#endif static kern_return_t pmap_pages_alloc( @@ -1941,6 +2783,30 @@ pmap_pages_alloc( unsigned size, unsigned option) { +#if XNU_MONITOR + if (size != PAGE_SIZE) { + panic("%s: size != PAGE_SIZE, " + "pa=%p, size=%u, option=%u", + __FUNCTION__, + pa, size, option); + } + + if (option & PMAP_PAGES_RECLAIM_NOWAIT) { + *pa = pmap_pages_reclaim(); + assert(*pa); + return KERN_SUCCESS; + } + + assert(option & PMAP_PAGES_ALLOCATE_NOWAIT); + + *pa = pmap_get_free_ppl_page(); + + if (*pa == 0) { + return KERN_RESOURCE_SHORTAGE; + } else { + return KERN_SUCCESS; + } +#else vm_page_t m = VM_PAGE_NULL, m_prev; if (option & PMAP_PAGES_RECLAIM_NOWAIT) { @@ -1979,14 +2845,114 @@ pmap_pages_alloc( m = NEXT_PAGE(m_prev); *(NEXT_PAGE_PTR(m_prev)) = VM_PAGE_NULL; } - vm_object_unlock(pmap_object); - - OSAddAtomic(size >> PAGE_SHIFT, &inuse_pmap_pages_count); - OSAddAtomic64(size >> PAGE_SHIFT, &alloc_pmap_pages_count); + vm_object_unlock(pmap_object); + + OSAddAtomic(size >> PAGE_SHIFT, &inuse_pmap_pages_count); + OSAddAtomic64(size >> PAGE_SHIFT, &alloc_pmap_pages_count); + + return KERN_SUCCESS; +#endif +} + +#if XNU_MONITOR +static pmap_paddr_t +pmap_alloc_page_for_kern(void) +{ + pmap_paddr_t paddr = 0; + vm_page_t m, m_prev; + + while ((m = vm_page_grab()) == VM_PAGE_NULL) { + VM_PAGE_WAIT(); + } + + vm_page_lock_queues(); + vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE); + 
vm_page_unlock_queues(); + + paddr = (pmap_paddr_t)ptoa(VM_PAGE_GET_PHYS_PAGE(m)); + + if (paddr == 0) { + panic("%s: paddr is 0", + __FUNCTION__); + } + + vm_object_lock(pmap_object); + + while (m != VM_PAGE_NULL) { + vm_page_insert_wired(m, pmap_object, (vm_object_offset_t) ((ptoa(VM_PAGE_GET_PHYS_PAGE(m))) - gPhysBase), VM_KERN_MEMORY_PTE); + m_prev = m; + m = NEXT_PAGE(m_prev); + *(NEXT_PAGE_PTR(m_prev)) = VM_PAGE_NULL; + } + + vm_object_unlock(pmap_object); + + OSAddAtomic(1, &inuse_pmap_pages_count); + OSAddAtomic64(1, &alloc_pmap_pages_count); + + return paddr; +} + +static void +pmap_alloc_page_for_ppl(void) +{ + pmap_mark_page_as_ppl_page(pmap_alloc_page_for_kern()); +} + +static pmap_t +pmap_alloc_pmap(void) +{ + pmap_t pmap = PMAP_NULL; + + pmap_simple_lock(&pmap_free_list_lock); + + if (pmap_free_list != PMAP_NULL) { + pmap = pmap_free_list; + pmap_free_list = *((pmap_t *)pmap); + + if (!PMAP_PTR_IS_VALID(pmap)) { + panic("%s: allocated pmap is not valid, pmap=%p", + __FUNCTION__, pmap); + } + } + + pmap_simple_unlock(&pmap_free_list_lock); + + return pmap; +} + +static void +pmap_free_pmap(pmap_t pmap) +{ + if (!PMAP_PTR_IS_VALID(pmap)) { + panic("%s: pmap is not valid, " + "pmap=%p", + __FUNCTION__, + pmap); + } - return KERN_SUCCESS; + pmap_simple_lock(&pmap_free_list_lock); + *((pmap_t *)pmap) = pmap_free_list; + pmap_free_list = pmap; + pmap_simple_unlock(&pmap_free_list_lock); } +static void +pmap_bootstrap_pmap_free_list(void) +{ + pmap_t cur_head = PMAP_NULL; + unsigned long i = 0; + + simple_lock_init(&pmap_free_list_lock, 0); + + for (i = 0; i < pmap_array_count; i++) { + *((pmap_t *)(&pmap_array[i])) = cur_head; + cur_head = &pmap_array[i]; + } + + pmap_free_list = cur_head; +} +#endif static void pmap_pages_free( @@ -2009,6 +2975,11 @@ pmap_pages_free( pmap_simple_unlock(&pmap_pages_lock); +#if XNU_MONITOR + (void)size; + + pmap_give_free_ppl_page(pa); +#else vm_page_t m; pmap_paddr_t pa_max; @@ -2024,6 +2995,7 @@ pmap_pages_free( 
vm_page_unlock_queues(); vm_object_unlock(pmap_object); } +#endif } static inline void @@ -2237,7 +3209,20 @@ pv_alloc( pmap_paddr_t pa; kern_return_t ret; +#if XNU_MONITOR + /* + * The PPL has no guarantee that its allocation + * will succeed, so steal pages if necessary to + * ensure that we can free up a PV allocation. + */ + ret = pmap_pages_alloc(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT); + + if (ret == KERN_RESOURCE_SHORTAGE) { + ret = pmap_pages_alloc(&pa, PAGE_SIZE, PMAP_PAGES_RECLAIM_NOWAIT); + } +#else ret = pmap_pages_alloc(&pa, PAGE_SIZE, 0); +#endif if (ret != KERN_SUCCESS) { panic("%s: failed to alloc page, ret=%d, " @@ -2306,6 +3291,7 @@ static inline void PV_ALLOC(pv_entry_t **pv_ep) { assert(*pv_ep == PV_ENTRY_NULL); +#if !XNU_MONITOR if (pv_kern_free_count < pv_kern_low_water_mark) { /* * If the kernel reserved pool is low, let non-kernel mappings wait for a page @@ -2313,6 +3299,7 @@ PV_ALLOC(pv_entry_t **pv_ep) */ return; } +#endif pmap_simple_lock(&pv_free_list_lock); if ((*pv_ep = pv_free_list) != 0) { @@ -2396,7 +3383,27 @@ mapping_free_prime(void) { kern_return_t kr = KERN_FAILURE; +#if XNU_MONITOR + unsigned int i = 0; + + /* + * Allocate the needed PPL pages up front, to minimize the change that + * we will need to call into the PPL multiple times. 
+ */ + for (i = 0; i < PV_ALLOC_INITIAL_TARGET; i += (PAGE_SIZE / sizeof(pv_entry_t))) { + pmap_alloc_page_for_ppl(); + } + + for (i = 0; i < PV_KERN_ALLOC_INITIAL_TARGET; i += (PAGE_SIZE / sizeof(pv_entry_t))) { + pmap_alloc_page_for_ppl(); + } + + while ((kr = mapping_free_prime_ppl()) == KERN_RESOURCE_SHORTAGE) { + pmap_alloc_page_for_ppl(); + } +#else kr = mapping_free_prime_internal(); +#endif if (kr != KERN_SUCCESS) { panic("%s: failed, kr=%d", @@ -2437,8 +3444,14 @@ mapping_replenish_internal(uint32_t kern_target_count, uint32_t user_target_coun pv_cnt = 0; pv_eh = pv_et = PV_ENTRY_NULL; +#if XNU_MONITOR + if ((ret = pmap_pages_alloc(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT)) != KERN_SUCCESS) { + return ret; + } +#else ret = pmap_pages_alloc(&pa, PAGE_SIZE, 0); assert(ret == KERN_SUCCESS); +#endif pv_page_count++; @@ -2475,7 +3488,14 @@ mapping_replenish(void) current_thread()->options |= TH_OPT_VMPRIV; for (;;) { +#if XNU_MONITOR + + while ((kr = mapping_replenish_ppl(pv_kern_low_water_mark, pv_low_water_mark)) == KERN_RESOURCE_SHORTAGE) { + pmap_alloc_page_for_ppl(); + } +#else kr = mapping_replenish_internal(pv_kern_low_water_mark, pv_low_water_mark); +#endif if (kr != KERN_SUCCESS) { panic("%s: failed, kr=%d", __FUNCTION__, kr); @@ -2847,9 +3867,111 @@ pmap_pte( #endif +#if __APRR_SUPPORTED__ +/* + * Indicates whether the given PTE has special restrictions due to the current + * APRR settings. + */ +static boolean_t +is_pte_aprr_protected(pt_entry_t pte) +{ + uint64_t aprr_el0_value; + uint64_t aprr_el1_value; + uint64_t aprr_index; + + MRS(aprr_el0_value, APRR_EL0); + MRS(aprr_el1_value, APRR_EL1); + aprr_index = PTE_TO_APRR_INDEX(pte); + + /* Check to see if this mapping had APRR restrictions. 
*/ + if ((APRR_EXTRACT_IDX_ATTR(aprr_el0_value, aprr_index) != APRR_EXTRACT_IDX_ATTR(APRR_EL0_RESET, aprr_index)) || + (APRR_EXTRACT_IDX_ATTR(aprr_el1_value, aprr_index) != APRR_EXTRACT_IDX_ATTR(APRR_EL1_RESET, aprr_index)) + ) { + return TRUE; + } + + return FALSE; +} +#endif /* __APRR_SUPPORTED__ */ + +#if __APRR_SUPPORTED__ +static boolean_t +is_pte_xprr_protected(pt_entry_t pte) +{ +#if __APRR_SUPPORTED__ + return is_pte_aprr_protected(pte); +#else /* __APRR_SUPPORTED__ */ +#error "XPRR configuration error" +#endif /* __APRR_SUPPORTED__ */ +} +#endif /* __APRR_SUPPORTED__*/ +#if __APRR_SUPPORTED__ +static uint64_t +__unused pte_to_xprr_perm(pt_entry_t pte) +{ +#if __APRR_SUPPORTED__ + switch (PTE_TO_APRR_INDEX(pte)) { + case APRR_FIRM_RX_INDEX: return XPRR_FIRM_RX_PERM; + case APRR_FIRM_RO_INDEX: return XPRR_FIRM_RO_PERM; + case APRR_PPL_RW_INDEX: return XPRR_PPL_RW_PERM; + case APRR_KERN_RW_INDEX: return XPRR_KERN_RW_PERM; + case APRR_FIRM_RW_INDEX: return XPRR_FIRM_RW_PERM; + case APRR_KERN0_RW_INDEX: return XPRR_KERN0_RW_PERM; + case APRR_USER_JIT_INDEX: return XPRR_USER_JIT_PERM; + case APRR_USER_RW_INDEX: return XPRR_USER_RW_PERM; + case APRR_PPL_RX_INDEX: return XPRR_PPL_RX_PERM; + case APRR_KERN_RX_INDEX: return XPRR_KERN_RX_PERM; + case APRR_PPL_RO_INDEX: return XPRR_PPL_RO_PERM; + case APRR_KERN_RO_INDEX: return XPRR_KERN_RO_PERM; + case APRR_KERN0_RX_INDEX: return XPRR_KERN0_RO_PERM; + case APRR_KERN0_RO_INDEX: return XPRR_KERN0_RO_PERM; + case APRR_USER_RX_INDEX: return XPRR_USER_RX_PERM; + case APRR_USER_RO_INDEX: return XPRR_USER_RO_PERM; + default: return XPRR_MAX_PERM; + } +#else +#error "XPRR configuration error" +#endif /**/ +} +#if __APRR_SUPPORTED__ +static uint64_t +xprr_perm_to_aprr_index(uint64_t perm) +{ + switch (perm) { + case XPRR_FIRM_RX_PERM: return APRR_FIRM_RX_INDEX; + case XPRR_FIRM_RO_PERM: return APRR_FIRM_RO_INDEX; + case XPRR_PPL_RW_PERM: return APRR_PPL_RW_INDEX; + case XPRR_KERN_RW_PERM: return APRR_KERN_RW_INDEX; + case 
XPRR_FIRM_RW_PERM: return APRR_FIRM_RW_INDEX; + case XPRR_KERN0_RW_PERM: return APRR_KERN0_RW_INDEX; + case XPRR_USER_JIT_PERM: return APRR_USER_JIT_INDEX; + case XPRR_USER_RW_PERM: return APRR_USER_RW_INDEX; + case XPRR_PPL_RX_PERM: return APRR_PPL_RX_INDEX; + case XPRR_KERN_RX_PERM: return APRR_KERN_RX_INDEX; + case XPRR_PPL_RO_PERM: return APRR_PPL_RO_INDEX; + case XPRR_KERN_RO_PERM: return APRR_KERN_RO_INDEX; + case XPRR_KERN0_RX_PERM: return APRR_KERN0_RO_INDEX; + case XPRR_KERN0_RO_PERM: return APRR_KERN0_RO_INDEX; + case XPRR_USER_RX_PERM: return APRR_USER_RX_INDEX; + case XPRR_USER_RO_PERM: return APRR_USER_RO_INDEX; + default: return APRR_MAX_INDEX; + } +} +#endif /* __APRR_SUPPORTED__ */ + +static pt_entry_t +__unused xprr_perm_to_pte(uint64_t perm) +{ +#if __APRR_SUPPORTED__ + return APRR_INDEX_TO_PTE(xprr_perm_to_aprr_index(perm)); +#else +#error "XPRR configuration error" +#endif /**/ +} +#endif /* __APRR_SUPPORTED__*/ /* @@ -3279,6 +4401,30 @@ pmap_bootstrap( lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL); +#if XNU_MONITOR + +#if DEVELOPMENT || DEBUG + PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable)); +#endif + + simple_lock_init(&pmap_ppl_free_page_lock, 0); + +#if __APRR_SUPPORTED__ + if (((uintptr_t)(&ppl_trampoline_start)) % PAGE_SIZE) { + panic("%s: ppl_trampoline_start is not page aligned, " + "vstart=%#lx", + __FUNCTION__, + vstart); + } + + if (((uintptr_t)(&ppl_trampoline_end)) % PAGE_SIZE) { + panic("%s: ppl_trampoline_end is not page aligned, " + "vstart=%#lx", + __FUNCTION__, + vstart); + } +#endif /* __APRR_SUPPORTED__ */ +#endif /* XNU_MONITOR */ #if DEVELOPMENT || DEBUG if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) { @@ -3353,6 +4499,28 @@ pmap_bootstrap( pmap_load_io_rgns(); ptd_bootstrap(ptd_root_table, (unsigned int)(ptd_root_table_size / sizeof(pt_desc_t))); +#if XNU_MONITOR + pmap_array_begin = (void *)phystokv(avail_start); + pmap_array = 
pmap_array_begin; + avail_start += round_page(MAX_ASID * sizeof(struct pmap)); + pmap_array_end = (void *)phystokv(avail_start); + + pmap_array_count = ((pmap_array_end - pmap_array_begin) / sizeof(struct pmap)); + + pmap_bootstrap_pmap_free_list(); + + pmap_ledger_ptr_array_begin = (void *)phystokv(avail_start); + pmap_ledger_ptr_array = pmap_ledger_ptr_array_begin; + avail_start += round_page(MAX_PMAP_LEDGERS * sizeof(void*)); + pmap_ledger_ptr_array_end = (void *)phystokv(avail_start); + + pmap_ledger_refcnt_begin = (void *)phystokv(avail_start); + pmap_ledger_refcnt = pmap_ledger_refcnt_begin; + avail_start += round_page(MAX_PMAP_LEDGERS * sizeof(os_refcnt_t)); + pmap_ledger_refcnt_end = (void *)phystokv(avail_start); + + simple_lock_init(&pmap_ledger_lock, 0); +#endif pmap_cpu_data_array_init(); vm_first_phys = gPhysBase; @@ -3430,6 +4598,135 @@ pmap_bootstrap( #endif /* KASAN */ } +#if XNU_MONITOR + +static inline void +pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa) +{ + pmap_paddr_t cur_pa; + for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) { + assert(pa_valid(cur_pa)); + pa_set_monitor(cur_pa); + } +} + +static void +pa_set_range_xprr_perm(pmap_paddr_t start_pa, + pmap_paddr_t end_pa, + unsigned int expected_perm, + unsigned int new_perm) +{ + vm_offset_t start_va = phystokv(start_pa); + vm_offset_t end_va = start_va + (end_pa - start_pa); + + pa_set_range_monitor(start_pa, end_pa); + pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm); +} + +void +pmap_static_allocations_done(void) +{ + pmap_paddr_t monitor_start_pa; + pmap_paddr_t monitor_end_pa; + + /* + * We allocate memory for bootstrap starting at topOfKernelData (which + * is at the end of the device tree and ramdisk data, if applicable). + * We use avail_start as a pointer to the first address that has not + * been reserved for bootstrap, so we know which pages to give to the + * virtual memory layer. 
+ * + * These bootstrap allocations will be used primarily for page tables. + * If we wish to secure the page tables, we need to start by marking + * these bootstrap allocations as pages that we want to protect. + */ + monitor_start_pa = BootArgs->topOfKernelData; + monitor_end_pa = BootArgs->topOfKernelData + BOOTSTRAP_TABLE_SIZE; + + /* The bootstrap page tables are mapped RO at boostrap. */ + pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_PPL_RO_PERM); + + monitor_start_pa = BootArgs->topOfKernelData + BOOTSTRAP_TABLE_SIZE; + monitor_end_pa = avail_start; + + /* The other bootstrap allocations are mapped RW at bootstrap. */ + pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM); + + /* The RO page tables are mapped RW at bootstrap. */ + monitor_start_pa = kvtophys((vm_offset_t)&ropagetable_begin); + monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin); + pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM); + + monitor_start_pa = kvtophys(segPPLDATAB); + monitor_end_pa = monitor_start_pa + segSizePPLDATA; + + /* PPL data is RW for the PPL, RO for the kernel. */ + pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM); + + monitor_start_pa = kvtophys(segPPLTEXTB); + monitor_end_pa = monitor_start_pa + segSizePPLTEXT; + + /* PPL text is RX for the PPL, RO for the kernel. */ + pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM); + +#if __APRR_SUPPORTED__ + monitor_start_pa = kvtophys(segPPLTRAMPB); + monitor_end_pa = monitor_start_pa + segSizePPLTRAMP; + + /* + * The PPLTRAMP pages will be a mix of PPL RX/kernel RO and + * PPL RX/kernel RX. However, all of these pages belong to the PPL. 
+ */ + pa_set_range_monitor(monitor_start_pa, monitor_end_pa); +#endif + + /* + * In order to support DTrace, the save areas for the PPL must be + * writable. This is due to the fact that DTrace will try to update + * register state. + */ + if (pmap_ppl_disable) { + vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start); + vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start); + + pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM); + } + +#if __APRR_SUPPORTED__ + /* The trampoline must also be specially protected. */ + pmap_set_range_xprr_perm((vm_offset_t)&ppl_trampoline_start, (vm_offset_t)&ppl_trampoline_end, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM); +#endif + + if (segSizePPLDATACONST > 0) { + monitor_start_pa = kvtophys(segPPLDATACONSTB); + monitor_end_pa = monitor_start_pa + segSizePPLDATACONST; + + pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_PPL_RO_PERM); + } + + /* + * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security + * precaution. The real RW mappings are at a different location with guard pages. + */ + pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_PPL_RO_PERM); +} + + +void +pmap_lockdown_ppl(void) +{ + /* Mark the PPL as being locked down. */ + +#if __APRR_SUPPORTED__ + pmap_ppl_locked_down = TRUE; + /* Force a trap into to the PPL to update APRR_EL1. 
*/ + pmap_return(FALSE, FALSE); +#else +#error "XPRR configuration error" +#endif /* __APRR_SUPPORTED__ */ + +} +#endif /* XNU_MONITOR */ void pmap_virtual_space( @@ -3644,6 +4941,151 @@ pmap_zone_init( PAGE_SIZE, "pmap"); } +#if XNU_MONITOR +MARK_AS_PMAP_TEXT static void +pmap_ledger_alloc_init_internal(size_t size) +{ + pmap_simple_lock(&pmap_ledger_lock); + + if (pmap_ledger_alloc_initialized) { + panic("%s: already initialized, " + "size=%lu", + __func__, + size); + } + + if (size != sizeof(pmap_ledger_data_t)) { + panic("%s: size mismatch, expected %lu, " + "size=%lu", + __func__, PMAP_LEDGER_DATA_BYTES, + size); + } + + pmap_ledger_alloc_initialized = true; + + pmap_simple_unlock(&pmap_ledger_lock); +} + +MARK_AS_PMAP_TEXT static ledger_t +pmap_ledger_alloc_internal(void) +{ + pmap_paddr_t paddr; + uint64_t vaddr, vstart, vend; + uint64_t index; + + ledger_t new_ledger; + uint64_t array_index; + + pmap_simple_lock(&pmap_ledger_lock); + if (pmap_ledger_free_list == NULL) { + paddr = pmap_get_free_ppl_page(); + + if (paddr == 0) { + pmap_simple_unlock(&pmap_ledger_lock); + return NULL; + } + + vstart = phystokv(paddr); + vend = vstart + PAGE_SIZE; + + for (vaddr = vstart; (vaddr < vend) && ((vaddr + sizeof(pmap_ledger_t)) <= vend); vaddr += sizeof(pmap_ledger_t)) { + pmap_ledger_t *free_ledger; + + index = pmap_ledger_ptr_array_free_index++; + + if (index >= MAX_PMAP_LEDGERS) { + panic("%s: pmap_ledger_ptr_array is full, index=%llu", + __func__, index); + } + + free_ledger = (pmap_ledger_t*)vaddr; + + pmap_ledger_ptr_array[index] = free_ledger; + free_ledger->back_ptr = &pmap_ledger_ptr_array[index]; + + free_ledger->next = pmap_ledger_free_list; + pmap_ledger_free_list = free_ledger; + } + + pa_set_range_xprr_perm(paddr, paddr + PAGE_SIZE, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM); + } + + new_ledger = (ledger_t)pmap_ledger_free_list; + pmap_ledger_free_list = pmap_ledger_free_list->next; + + array_index = pmap_ledger_validate(new_ledger); + 
os_ref_init(&pmap_ledger_refcnt[array_index], NULL); + + pmap_simple_unlock(&pmap_ledger_lock); + + return new_ledger; +} + +MARK_AS_PMAP_TEXT static void +pmap_ledger_free_internal(ledger_t ledger) +{ + pmap_ledger_t* free_ledger; + + free_ledger = (pmap_ledger_t*)ledger; + + pmap_simple_lock(&pmap_ledger_lock); + uint64_t array_index = pmap_ledger_validate(ledger); + + if (os_ref_release(&pmap_ledger_refcnt[array_index]) != 0) { + panic("%s: ledger still referenced, " + "ledger=%p", + __func__, + ledger); + } + + free_ledger->next = pmap_ledger_free_list; + pmap_ledger_free_list = free_ledger; + pmap_simple_unlock(&pmap_ledger_lock); +} + + +static void +pmap_ledger_retain(ledger_t ledger) +{ + pmap_simple_lock(&pmap_ledger_lock); + uint64_t array_index = pmap_ledger_validate(ledger); + os_ref_retain(&pmap_ledger_refcnt[array_index]); + pmap_simple_unlock(&pmap_ledger_lock); +} + +static void +pmap_ledger_release(ledger_t ledger) +{ + pmap_simple_lock(&pmap_ledger_lock); + uint64_t array_index = pmap_ledger_validate(ledger); + os_ref_release_live(&pmap_ledger_refcnt[array_index]); + pmap_simple_unlock(&pmap_ledger_lock); +} + +void +pmap_ledger_alloc_init(size_t size) +{ + pmap_ledger_alloc_init_ppl(size); +} + +ledger_t +pmap_ledger_alloc(void) +{ + ledger_t retval = NULL; + + while ((retval = pmap_ledger_alloc_ppl()) == NULL) { + pmap_alloc_page_for_ppl(); + } + + return retval; +} + +void +pmap_ledger_free(ledger_t ledger) +{ + pmap_ledger_free_ppl(ledger); +} +#else /* XNU_MONITOR */ __dead2 void pmap_ledger_alloc_init(size_t size) @@ -3669,6 +5111,7 @@ pmap_ledger_free(ledger_t ledger) "ledger=%p", __func__, ledger); } +#endif /* XNU_MONITOR */ /* * Create and return a physical map. @@ -3703,6 +5146,11 @@ pmap_create_options_internal( return PMAP_NULL; } +#if XNU_MONITOR + if ((p = pmap_alloc_pmap()) == PMAP_NULL) { + return PMAP_NULL; + } +#else /* * Allocate a pmap struct from the pmap_zone. 
Then allocate * the translation table of the right size for the pmap. @@ -3710,6 +5158,7 @@ pmap_create_options_internal( if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) { return PMAP_NULL; } +#endif if (flags & PMAP_CREATE_64BIT) { p->min = MACH_VM_MIN_ADDRESS; @@ -3743,6 +5192,12 @@ pmap_create_options_internal( } +#if XNU_MONITOR + if (ledger) { + pmap_ledger_validate(ledger); + pmap_ledger_retain(ledger); + } +#endif /* XNU_MONITOR */ p->ledger = ledger; @@ -3756,7 +5211,11 @@ pmap_create_options_internal( p->tte_index_max = tte_index_max; #endif +#if XNU_MONITOR + p->tte = pmap_tt1_allocate(p, PMAP_ROOT_ALLOC_SIZE, PMAP_TT_ALLOCATE_NOWAIT); +#else p->tte = pmap_tt1_allocate(p, PMAP_ROOT_ALLOC_SIZE, 0); +#endif if (!(p->tte)) { goto tt1_alloc_fail; } @@ -3803,7 +5262,15 @@ pmap_create_options_internal( tt1_alloc_fail: pmap_get_pt_ops(p)->free_id(p); id_alloc_fail: +#if XNU_MONITOR + pmap_free_pmap(p); + + if (ledger) { + pmap_ledger_release(ledger); + } +#else zfree(pmap_zone, p); +#endif return PMAP_NULL; } @@ -3819,7 +5286,17 @@ pmap_create_options( ledger_reference(ledger); +#if XNU_MONITOR + /* + * TODO: It should be valid for pmap_create_options_internal to fail; we could + * be out of ASIDs. + */ + while ((pmap = pmap_create_options_ppl(ledger, size, flags)) == PMAP_NULL) { + pmap_alloc_page_for_ppl(); + } +#else pmap = pmap_create_options_internal(ledger, size, flags); +#endif if (pmap == PMAP_NULL) { ledger_dereference(ledger); @@ -3830,7 +5307,13 @@ pmap_create_options( return pmap; } -#if MACH_ASSERT +#if XNU_MONITOR +/* + * This symbol remains in place when the PPL is enabled so that the dispatch + * table does not change from development to release configurations. 
+ */ +#endif +#if MACH_ASSERT || XNU_MONITOR MARK_AS_PMAP_TEXT static void pmap_set_process_internal( __unused pmap_t pmap, @@ -3874,7 +5357,7 @@ pmap_set_process_internal( } #endif /* MACH_ASSERT */ } -#endif /* MACH_ASSERT*/ +#endif /* MACH_ASSERT || XNU_MONITOR */ #if MACH_ASSERT void @@ -3883,7 +5366,11 @@ pmap_set_process( int pid, char *procname) { +#if XNU_MONITOR + pmap_set_process_ppl(pmap, pid, procname); +#else pmap_set_process_internal(pmap, pid, procname); +#endif } #endif /* MACH_ASSERT */ @@ -3999,10 +5486,22 @@ pmap_destroy_internal( pmap_check_ledgers(pmap); if (pmap->nested_region_asid_bitmap) { +#if XNU_MONITOR + pmap_pages_free(kvtophys((vm_offset_t)(pmap->nested_region_asid_bitmap)), PAGE_SIZE); +#else kfree(pmap->nested_region_asid_bitmap, pmap->nested_region_asid_bitmap_size * sizeof(unsigned int)); +#endif } +#if XNU_MONITOR + if (pmap->ledger) { + pmap_ledger_release(pmap->ledger); + } + + pmap_free_pmap(pmap); +#else zfree(pmap_zone, pmap); +#endif } void @@ -4015,7 +5514,13 @@ pmap_destroy( ledger = pmap->ledger; +#if XNU_MONITOR + pmap_destroy_ppl(pmap); + + pmap_check_ledger_fields(ledger); +#else pmap_destroy_internal(pmap); +#endif ledger_dereference(ledger); @@ -4040,7 +5545,11 @@ void pmap_reference( pmap_t pmap) { +#if XNU_MONITOR + pmap_reference_ppl(pmap); +#else pmap_reference_internal(pmap); +#endif } static tt_entry_t * @@ -4084,6 +5593,9 @@ pmap_tt1_allocate( return (tt_entry_t *)0; } +#if XNU_MONITOR + assert(pa); +#endif if (size < PAGE_SIZE) { va = phystokv(pa) + size; @@ -4263,6 +5775,9 @@ pmap_tt_allocate( *ttp = (tt_entry_t *)phystokv(pa); } +#if XNU_MONITOR + assert(*ttp); +#endif return KERN_SUCCESS; } @@ -4525,6 +6040,11 @@ pmap_remove_pv( pv_h = pai_to_pvh(pai); vm_offset_t pvh_flags = pvh_get_flags(pv_h); +#if XNU_MONITOR + if (pvh_flags & PVH_FLAG_LOCKDOWN) { + panic("%d is locked down (%#lx), cannot remove", pai, pvh_flags); + } +#endif if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) { if (__builtin_expect((cpte != 
pvh_ptep(pv_h)), 0)) { @@ -4685,6 +6205,14 @@ pmap_remove_range_options( //assert(!ARM_PTE_IS_COMPRESSED(spte)); pa = pte_to_pa(spte); if (!pa_valid(pa)) { +#if XNU_MONITOR || HAS_MILD_DSB + unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa)); +#endif +#if XNU_MONITOR + if (!pmap_ppl_disable && (cacheattr & PP_ATTR_MONITOR)) { + panic("%s: attempt to remove mapping of PPL-protected I/O address 0x%llx", __func__, (uint64_t)pa); + } +#endif break; } pai = (int)pa_index(pa); @@ -4985,7 +6513,13 @@ pmap_remove_options( l = end; } +#if XNU_MONITOR + remove_count += pmap_remove_options_ppl(pmap, va, l, options); + + pmap_ledger_check_balance(pmap); +#else remove_count += pmap_remove_options_internal(pmap, va, l, options); +#endif va = l; } @@ -5099,7 +6633,11 @@ pmap_switch( pmap_t pmap) { PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid); +#if XNU_MONITOR + pmap_switch_ppl(pmap); +#else pmap_switch_internal(pmap); +#endif PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END); } @@ -5167,6 +6705,11 @@ pmap_page_protect_options_internal( pv_h = pai_to_pvh(pai); pvh_flags = pvh_get_flags(pv_h); +#if XNU_MONITOR + if (remove && (pvh_flags & PVH_FLAG_LOCKDOWN)) { + panic("%d is locked down (%#llx), cannot remove", pai, pvh_get_flags(pv_h)); + } +#endif pte_p = PT_ENTRY_NULL; pve_p = PV_ENTRY_NULL; @@ -5194,6 +6737,12 @@ pmap_page_protect_options_internal( #ifdef PVH_FLAG_IOMMU if ((vm_offset_t)pte_p & PVH_FLAG_IOMMU) { +#if XNU_MONITOR + if (pvh_flags & PVH_FLAG_LOCKDOWN) { + panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu 0x%llx, pve_p=%p", + ppnum, (uint64_t)pte_p & ~PVH_FLAG_IOMMU, pve_p); + } +#endif if (remove) { if (options & PMAP_OPTIONS_COMPRESSOR) { panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu 0x%llx, pve_p=%p", @@ -5397,6 +6946,17 @@ pmap_page_protect_options_internal( tmplate |= pt_attr_leaf_xn(pt_attr); } +#if __APRR_SUPPORTED__ + 
if (__improbable(is_pte_xprr_protected(spte))) { + panic("pmap_page_protect: modifying an xPRR mapping pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx ppnum: 0x%x", + pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)spte, (uint64_t)tmplate, (uint64_t)va, ppnum); + } + + if (__improbable(is_pte_xprr_protected(tmplate))) { + panic("pmap_page_protect: creating an xPRR mapping pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx ppnum: 0x%x", + pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)spte, (uint64_t)tmplate, (uint64_t)va, ppnum); + } +#endif /* __APRR_SUPPORTED__*/ if (*pte_p != ARM_PTE_TYPE_FAULT && !ARM_PTE_IS_COMPRESSED(*pte_p, pte_p) && @@ -5480,7 +7040,11 @@ pmap_page_protect_options( PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot); +#if XNU_MONITOR + pmap_page_protect_options_ppl(ppnum, prot, options); +#else pmap_page_protect_options_internal(ppnum, prot, options); +#endif PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END); } @@ -5729,6 +7293,18 @@ pmap_protect_options_internal( /* We do not expect to write fast fault the entry. */ pte_set_was_writeable(tmplate, false); +#if __APRR_SUPPORTED__ + if (__improbable(is_pte_xprr_protected(spte) && (pte_to_xprr_perm(spte) != XPRR_USER_JIT_PERM))) { + /* Only test for PPL protection here, User-JIT mappings may be mutated by this function. 
*/ + panic("%s: modifying a PPL mapping pte_p=%p pmap=%p prot=%d options=%u, pte=0x%llx, tmplate=0x%llx", + __func__, pte_p, pmap, prot, options, (uint64_t)spte, (uint64_t)tmplate); + } + + if (__improbable(is_pte_xprr_protected(tmplate))) { + panic("%s: creating an xPRR mapping pte_p=%p pmap=%p prot=%d options=%u, pte=0x%llx, tmplate=0x%llx", + __func__, pte_p, pmap, prot, options, (uint64_t)spte, (uint64_t)tmplate); + } +#endif /* __APRR_SUPPORTED__*/ WRITE_PTE_FAST(pte_p, tmplate); if (managed) { @@ -5798,7 +7374,11 @@ pmap_protect_options( l = e; } +#if XNU_MONITOR + pmap_protect_options_ppl(pmap, beg, l, prot, options, args); +#else pmap_protect_options_internal(pmap, beg, l, prot, options, args); +#endif beg = l; } @@ -5979,6 +7559,11 @@ pmap_enter_pv( vm_offset_t pvh_flags = pvh_get_flags(pv_h); +#if XNU_MONITOR + if (pvh_flags & PVH_FLAG_LOCKDOWN) { + panic("%d is locked down (%#lx), cannot enter", pai, pvh_flags); + } +#endif #ifdef PVH_FLAG_CPU /* An IOMMU mapping may already be present for a page that hasn't yet @@ -6384,6 +7969,22 @@ pmap_enter_options_internal( pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK); pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits); +#if XNU_MONITOR + /* The regular old kernel is not allowed to remap PPL pages. 
*/ + if (pa_test_monitor(pa)) { + panic("%s: page belongs to PPL, " + "pmap=%p, v=0x%llx, pn=%u, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x", + __FUNCTION__, + pmap, v, pn, prot, fault_type, flags, wired, options); + } + + if (pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN) { + panic("%s: page locked down, " + "pmap=%p, v=0x%llx, pn=%u, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x", + __FUNCTION__, + pmap, v, pn, prot, fault_type, flags, wired, options); + } +#endif if (pte == *pte_p) { @@ -6483,6 +8084,22 @@ pmap_enter_options_internal( pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits); +#if XNU_MONITOR + if (!pmap_ppl_disable && (wimg_bits & PP_ATTR_MONITOR)) { + uint64_t xprr_perm = pte_to_xprr_perm(pte); + pte &= ~ARM_PTE_XPRR_MASK; + switch (xprr_perm) { + case XPRR_KERN_RO_PERM: + pte |= xprr_perm_to_pte(XPRR_PPL_RO_PERM); + break; + case XPRR_KERN_RW_PERM: + pte |= xprr_perm_to_pte(XPRR_PPL_RW_PERM); + break; + default: + panic("Unsupported xPRR perm %llu for pte 0x%llx", xprr_perm, (uint64_t)pte); + } + } +#endif pmap_enter_pte(pmap, pte_p, pte, v); } @@ -6538,7 +8155,25 @@ pmap_enter_options( PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pn, prot); +#if XNU_MONITOR + if (options & PMAP_OPTIONS_NOWAIT) { + /* If NOWAIT was requested, just return the result. */ + kr = pmap_enter_options_ppl(pmap, v, pn, prot, fault_type, flags, wired, options); + } else { + /* + * If NOWAIT was not requested, loop until the enter does not + * fail due to lack of resources. 
+ */ + while ((kr = pmap_enter_options_ppl(pmap, v, pn, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT)) == KERN_RESOURCE_SHORTAGE) { + pv_water_mark_check(); + pmap_alloc_page_for_ppl(); + } + } + + pmap_ledger_check_balance(pmap); +#else kr = pmap_enter_options_internal(pmap, v, pn, prot, fault_type, flags, wired, options); +#endif pv_water_mark_check(); PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr); @@ -6614,7 +8249,13 @@ pmap_change_wiring( vm_map_address_t v, boolean_t wired) { +#if XNU_MONITOR + pmap_change_wiring_ppl(pmap, v, wired); + + pmap_ledger_check_balance(pmap); +#else pmap_change_wiring_internal(pmap, v, wired); +#endif } MARK_AS_PMAP_TEXT static ppnum_t @@ -6657,7 +8298,11 @@ pmap_find_phys( } if (not_in_kdp) { +#if XNU_MONITOR + return pmap_find_phys_ppl(pmap, va); +#else return pmap_find_phys_internal(pmap, va); +#endif } else { return pmap_vtophys(pmap, va); } @@ -6804,7 +8449,11 @@ pmap_extract( return pa; } +#if XNU_MONITOR + return pmap_extract_ppl(pmap, va); +#else return pmap_extract_internal(pmap, va); +#endif } /* @@ -6986,7 +8635,14 @@ pmap_expand( if (options & PMAP_OPTIONS_NOWAIT) { return KERN_RESOURCE_SHORTAGE; } +#if XNU_MONITOR + panic("%s: failed to allocate tt, " + "pmap=%p, v=%p, options=0x%x, level=%u", + __FUNCTION__, + pmap, (void *)v, options, level); +#else VM_PAGE_WAIT(); +#endif } PMAP_LOCK(pmap); if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) { @@ -7051,6 +8707,13 @@ void pmap_gc( void) { +#if XNU_MONITOR + /* + * We cannot invoke the scheduler from the PPL, so for now we elide the + * GC logic if the PPL is enabled. 
+ */ +#endif +#if !XNU_MONITOR pmap_t pmap, pmap_next; boolean_t gc_wait; @@ -7085,6 +8748,7 @@ pmap_gc( } pmap_simple_unlock(&pmaps_lock); } +#endif } /* @@ -7093,7 +8757,11 @@ pmap_gc( uint64_t pmap_release_pages_fast(void) { +#if XNU_MONITOR + return pmap_release_ppl_pages_to_kernel(); +#else /* XNU_MONITOR */ return 0; +#endif } /* @@ -7227,6 +8895,14 @@ phys_attribute_clear_internal( pmap_paddr_t pa = ptoa(pn); vm_prot_t allow_mode = VM_PROT_ALL; +#if XNU_MONITOR + if (bits & PP_ATTR_PPL_OWNED_BITS) { + panic("%s: illegal request, " + "pn=%u, bits=%#x, options=%#x, arg=%p", + __FUNCTION__, + pn, bits, options, arg); + } +#endif if ((bits & PP_ATTR_MODIFIED) && (options & PMAP_OPTIONS_NOFLUSH) && @@ -7288,7 +8964,11 @@ phys_attribute_clear( */ PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits); +#if XNU_MONITOR + phys_attribute_clear_ppl(pn, bits, options, arg); +#else phys_attribute_clear_internal(pn, bits, options, arg); +#endif PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END); } @@ -7308,6 +8988,14 @@ phys_attribute_set_internal( pmap_paddr_t pa = ptoa(pn); assert(pn != vm_page_fictitious_addr); +#if XNU_MONITOR + if (bits & PP_ATTR_PPL_OWNED_BITS) { + panic("%s: illegal request, " + "pn=%u, bits=%#x", + __FUNCTION__, + pn, bits); + } +#endif pa_set_bits(pa, bits); @@ -7319,7 +9007,11 @@ phys_attribute_set( ppnum_t pn, unsigned int bits) { +#if XNU_MONITOR + phys_attribute_set_ppl(pn, bits); +#else phys_attribute_set_internal(pn, bits); +#endif } @@ -7572,10 +9264,19 @@ pmap_clear_noencrypt( #endif } +#if XNU_MONITOR +boolean_t +pmap_is_monitor(ppnum_t pn) +{ + assert(pa_valid(ptoa(pn))); + return phys_attribute_test(pn, PP_ATTR_MONITOR); +} +#endif void pmap_lock_phys_page(ppnum_t pn) { +#if !XNU_MONITOR int pai; pmap_paddr_t phys = ptoa(pn); @@ -7583,6 +9284,9 @@ pmap_lock_phys_page(ppnum_t pn) pai = (int)pa_index(phys); LOCK_PVH(pai); } else +#else + (void)pn; +#endif { simple_lock(&phys_backup_lock, LCK_GRP_NULL);} } 
@@ -7590,6 +9294,7 @@ pmap_lock_phys_page(ppnum_t pn) void pmap_unlock_phys_page(ppnum_t pn) { +#if !XNU_MONITOR int pai; pmap_paddr_t phys = ptoa(pn); @@ -7597,6 +9302,9 @@ pmap_unlock_phys_page(ppnum_t pn) pai = (int)pa_index(phys); UNLOCK_PVH(pai); } else +#else + (void)pn; +#endif { simple_unlock(&phys_backup_lock);} } @@ -7683,7 +9391,11 @@ pmap_switch_user_ttb( pmap_t pmap) { PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH_USER_TTB) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid); +#if XNU_MONITOR + pmap_switch_user_ttb_ppl(pmap); +#else pmap_switch_user_ttb_internal(pmap); +#endif PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH_USER_TTB) | DBG_FUNC_END); } @@ -7700,7 +9412,11 @@ pmap_clear_user_ttb_internal(void) void pmap_clear_user_ttb(void) { +#if XNU_MONITOR + pmap_clear_user_ttb_ppl(); +#else pmap_clear_user_ttb_internal(); +#endif } /* @@ -7817,6 +9533,16 @@ arm_force_fast_fault_internal( } } +#if MACH_ASSERT && XNU_MONITOR + if (is_pte_xprr_protected(spte)) { + if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) { + panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, " + "ppnum=0x%x, options=0x%x, allow_mode=0x%x", + __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va, + ppnum, options, allow_mode); + } + } +#endif /* MACH_ASSERT && XNU_MONITOR */ if (update_pte) { if (*pte_p != ARM_PTE_TYPE_FAULT && @@ -7928,7 +9654,11 @@ arm_force_fast_fault( return FALSE; /* Not a managed page. 
*/ } +#if XNU_MONITOR + return arm_force_fast_fault_ppl(ppnum, allow_mode, options); +#else return arm_force_fast_fault_internal(ppnum, allow_mode, options); +#endif } /* @@ -8021,6 +9751,16 @@ arm_clear_fast_fault( } } +#if MACH_ASSERT && XNU_MONITOR + if (is_pte_xprr_protected(spte)) { + if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) { + panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, " + "ppnum=0x%x, fault_type=0x%x", + __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va, + ppnum, fault_type); + } + } +#endif /* MACH_ASSERT && XNU_MONITOR */ if (spte != tmplate) { if (spte != ARM_PTE_TYPE_FAULT) { @@ -8099,17 +9839,51 @@ arm_fast_fault_internal( if (!pa_valid(pa)) { PMAP_UNLOCK(pmap); +#if XNU_MONITOR + if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) { + return KERN_PROTECTION_FAILURE; + } else +#endif return result; } pai = (int)pa_index(pa); LOCK_PVH(pai); +#if __APRR_SUPPORTED__ + if (*ptep == spte) { + /* + * Double-check the spte value, as we care + * about the AF bit. + */ + break; + } + UNLOCK_PVH(pai); +#else /* !(__APRR_SUPPORTED__*/ break; +#endif /* !(__APRR_SUPPORTED__*/ } } else { PMAP_UNLOCK(pmap); return result; } +#if __APRR_SUPPORTED__ + /* Check to see if this mapping had APRR restrictions. */ + if (is_pte_xprr_protected(spte)) { + /* + * We have faulted on an XPRR managed mapping; decide if the access should be + * reattempted or if it should cause an exception. Now that all JIT entitled + * task threads always have MPRR enabled we're only here because of + * an AF fault or an actual permission fault. AF faults will have result + * changed to KERN_SUCCESS below upon arm_clear_fast_fault return. 
+ */ + if (was_af_fault && (spte & ARM_PTE_AF)) { + result = KERN_SUCCESS; + goto out; + } else { + result = KERN_PROTECTION_FAILURE; + } + } +#endif /* __APRR_SUPPORTED__*/ if ((IS_REFFAULT_PAGE(pai)) || ((fault_type & VM_PROT_WRITE) && IS_MODFAULT_PAGE(pai))) { @@ -8140,6 +9914,9 @@ arm_fast_fault_internal( } } +#if __APRR_SUPPORTED__ +out: +#endif /* __APRR_SUPPORTED__*/ UNLOCK_PVH(pai); PMAP_UNLOCK(pmap); return result; @@ -8182,7 +9959,11 @@ arm_fast_fault( } #endif +#if XNU_MONITOR + result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user); +#else result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user); +#endif #if (__ARM_VMSA__ == 7) done: @@ -8304,7 +10085,27 @@ pmap_map_cpu_windows_copy_internal( vm_offset_t cpu_copywindow_vaddr = 0; bool need_strong_sync = false; +#if XNU_MONITOR || HAS_MILD_DSB + unsigned int cacheattr = (!pa_valid(ptoa(pn)) ? pmap_cache_attributes(pn) : 0); + need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0); +#endif + +#if XNU_MONITOR +#ifdef __ARM_COHERENT_IO__ + if (pa_valid(ptoa(pn)) && !pmap_ppl_disable) { + panic("%s: attempted to map a managed page, " + "pn=%u, prot=0x%x, wimg_bits=0x%x", + __FUNCTION__, + pn, prot, wimg_bits); + } + if (!pmap_ppl_disable && (cacheattr & PP_ATTR_MONITOR)) { + panic("%s: attempt to map PPL-protected I/O address 0x%llx", __func__, (uint64_t)ptoa(pn)); + } +#else /* __ARM_COHERENT_IO__ */ +#error CPU copy windows are not properly supported with both the PPL and incoherent IO +#endif /* __ARM_COHERENT_IO__ */ +#endif /* XNU_MONITOR */ cpu_num = pmap_cpu_data->cpu_number; for (i = 0; i < CPUWINDOWS_MAX; i++) { @@ -8350,7 +10151,11 @@ pmap_map_cpu_windows_copy( vm_prot_t prot, unsigned int wimg_bits) { +#if XNU_MONITOR + return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits); +#else return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits); +#endif } MARK_AS_PMAP_TEXT static void @@ -8378,7 +10183,11 @@ void pmap_unmap_cpu_windows_copy( 
unsigned int index) { +#if XNU_MONITOR + return pmap_unmap_cpu_windows_copy_ppl(index); +#else return pmap_unmap_cpu_windows_copy_internal(index); +#endif } /* @@ -8398,7 +10207,11 @@ void pmap_set_nested( pmap_t pmap) { +#if XNU_MONITOR + pmap_set_nested_ppl(pmap); +#else pmap_set_nested_internal(pmap); +#endif } /* @@ -8727,9 +10540,72 @@ pmap_trim( addr64_t nstart, uint64_t size) { +#if XNU_MONITOR + pmap_trim_ppl(grand, subord, vstart, nstart, size); + + pmap_ledger_check_balance(grand); + pmap_ledger_check_balance(subord); +#else pmap_trim_internal(grand, subord, vstart, nstart, size); +#endif } +#if HAS_APPLE_PAC && XNU_MONITOR +static void * +pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator) +{ + void *res = NULL; + boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE); + + ml_set_kernelkey_enabled(FALSE); + switch (key) { + case ptrauth_key_asia: + res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator); + break; + case ptrauth_key_asda: + res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator); + break; + default: + panic("attempt to sign user pointer without process independent key"); + } + ml_set_kernelkey_enabled(TRUE); + + ml_set_interrupts_enabled(current_intr_state); + + return res; +} + +void * +pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator) +{ + return pmap_sign_user_ptr_internal(value, key, discriminator); +} + +static void * +pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator) +{ + if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) { + panic("attempt to auth user pointer without process independent key"); + } + + void *res = NULL; + boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE); + + ml_set_kernelkey_enabled(FALSE); + res = ml_auth_ptr_unchecked(value, key, discriminator); + ml_set_kernelkey_enabled(TRUE); + + ml_set_interrupts_enabled(current_intr_state); + + return res; +} + +void * 
+pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator) +{ + return pmap_auth_user_ptr_internal(value, key, discriminator); +} +#endif /* HAS_APPLE_PAC && XNU_MONITOR */ /* * kern_return_t pmap_nest(grand, subord, vstart, size) @@ -8776,6 +10652,9 @@ pmap_nest_internal( __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand); assert(pmap_get_pt_attr(subord) == pt_attr); +#if XNU_MONITOR + expand_options |= PMAP_TT_ALLOCATE_NOWAIT; +#endif if (((size | vstart | nstart) & (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL) { panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx\n", grand, vstart, nstart, size); @@ -8792,7 +10671,29 @@ pmap_nest_internal( if (subord->nested_region_asid_bitmap == NULL) { nested_region_asid_bitmap_size = (unsigned int)(size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY); +#if XNU_MONITOR + pmap_paddr_t pa = 0; + + if ((nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE) { + panic("%s: nested_region_asid_bitmap_size=%u will not fit in a page, " + "grand=%p, subord=%p, vstart=0x%llx, nstart=0x%llx, size=%llx", + __FUNCTION__, + nested_region_asid_bitmap_size, + grand, subord, vstart, nstart, size); + } + + kr = pmap_pages_alloc(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT); + + if (kr != KERN_SUCCESS) { + return kr; + } + + assert(pa); + + nested_region_asid_bitmap = (unsigned int *)phystokv(pa); +#else nested_region_asid_bitmap = kalloc(nested_region_asid_bitmap_size * sizeof(unsigned int)); +#endif bzero(nested_region_asid_bitmap, nested_region_asid_bitmap_size * sizeof(unsigned int)); PMAP_LOCK(subord); @@ -8805,7 +10706,11 @@ pmap_nest_internal( } PMAP_UNLOCK(subord); if (nested_region_asid_bitmap != NULL) { +#if XNU_MONITOR + pmap_pages_free(kvtophys((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE); +#else kfree(nested_region_asid_bitmap, nested_region_asid_bitmap_size * sizeof(unsigned int)); +#endif } } if ((subord->nested_region_subord_addr + 
subord->nested_region_size) < nend) { @@ -8820,7 +10725,29 @@ pmap_nest_internal( /* We explicitly add 1 to the bitmap allocation size in order to avoid issues with truncation. */ new_nested_region_asid_bitmap_size = (unsigned int)((new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY)) + 1; +#if XNU_MONITOR + pmap_paddr_t pa = 0; + + if ((new_nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE) { + panic("%s: new_nested_region_asid_bitmap_size=%u will not fit in a page, " + "grand=%p, subord=%p, vstart=0x%llx, nstart=0x%llx, size=%llx", + __FUNCTION__, + new_nested_region_asid_bitmap_size, + grand, subord, vstart, nstart, size); + } + + kr = pmap_pages_alloc(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT); + + if (kr != KERN_SUCCESS) { + return kr; + } + + assert(pa); + + new_nested_region_asid_bitmap = (unsigned int *)phystokv(pa); +#else new_nested_region_asid_bitmap = kalloc(new_nested_region_asid_bitmap_size * sizeof(unsigned int)); +#endif PMAP_LOCK(subord); if (subord->nested_region_size < new_size) { bzero(new_nested_region_asid_bitmap, new_nested_region_asid_bitmap_size * sizeof(unsigned int)); @@ -8834,9 +10761,17 @@ pmap_nest_internal( } PMAP_UNLOCK(subord); if (nested_region_asid_bitmap != NULL) +#if XNU_MONITOR + {pmap_pages_free(kvtophys((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);} +#else { kfree(nested_region_asid_bitmap, nested_region_asid_bitmap_size * sizeof(unsigned int));} +#endif if (new_nested_region_asid_bitmap != NULL) +#if XNU_MONITOR + {pmap_pages_free(kvtophys((vm_offset_t)new_nested_region_asid_bitmap), PAGE_SIZE);} +#else { kfree(new_nested_region_asid_bitmap, new_nested_region_asid_bitmap_size * sizeof(unsigned int));} +#endif } PMAP_LOCK(subord); @@ -9016,7 +10951,16 @@ pmap_nest( VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord), VM_KERNEL_ADDRHIDE(vstart)); +#if XNU_MONITOR + while ((kr = pmap_nest_ppl(grand, subord, vstart, nstart, size)) == KERN_RESOURCE_SHORTAGE) { + 
pmap_alloc_page_for_ppl(); + } + + pmap_ledger_check_balance(grand); + pmap_ledger_check_balance(subord); +#else kr = pmap_nest_internal(grand, subord, vstart, nstart, size); +#endif PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr); @@ -9197,7 +11141,11 @@ pmap_unnest_options( PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr)); +#if XNU_MONITOR + kr = pmap_unnest_options_ppl(grand, vaddr, size, option); +#else kr = pmap_unnest_options_internal(grand, vaddr, size, option); +#endif PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, kr); @@ -9471,6 +11419,11 @@ pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, un LOCK_PVH(pai); +#if XNU_MONITOR + if (__improbable(pa_test_monitor(paddr))) { + panic("%s invoked on PPL page 0x%08x", __func__, pn); + } +#endif pmap_update_cache_attributes_locked(pn, new_cacheattr); @@ -9485,7 +11438,11 @@ pmap_map_compressor_page(ppnum_t pn) #if __ARM_PTE_PHYSMAP__ unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK; if (cacheattr != VM_WIMG_DEFAULT) { +#if XNU_MONITOR + pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT); +#else pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT); +#endif } #endif return (void*)phystokv(ptoa(pn)); @@ -9497,7 +11454,11 @@ pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused) #if __ARM_PTE_PHYSMAP__ unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK; if (cacheattr != VM_WIMG_DEFAULT) { +#if XNU_MONITOR + pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr); +#else pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr); +#endif } #endif } @@ -9540,6 +11501,11 @@ pmap_batch_set_cache_attributes_internal( if (doit) { LOCK_PVH(pai); +#if XNU_MONITOR + if (pa_test_monitor(paddr)) { + panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr); + } +#endif } do { @@ -9611,7 +11577,11 @@ 
pmap_batch_set_cache_attributes( boolean_t doit, unsigned int *res) { +#if XNU_MONITOR + return pmap_batch_set_cache_attributes_ppl(pn, cacheattr, page_cnt, page_index, doit, res); +#else return pmap_batch_set_cache_attributes_internal(pn, cacheattr, page_cnt, page_index, doit, res); +#endif } MARK_AS_PMAP_TEXT static void @@ -9640,6 +11610,13 @@ pmap_set_cache_attributes_priv( LOCK_PVH(pai); +#if XNU_MONITOR + if (external && pa_test_monitor(paddr)) { + panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr); + } else if (!external && !pa_test_monitor(paddr)) { + panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr); + } +#endif do { pp_attr_current = pp_attr_table[pai]; @@ -9681,7 +11658,11 @@ pmap_set_cache_attributes( ppnum_t pn, unsigned int cacheattr) { +#if XNU_MONITOR + pmap_set_cache_attributes_ppl(pn, cacheattr); +#else pmap_set_cache_attributes_internal(pn, cacheattr); +#endif } MARK_AS_PMAP_TEXT void @@ -9705,7 +11686,11 @@ pmap_update_cache_attributes_locked( tmplate = *pte_p; tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK); +#if XNU_MONITOR + tmplate |= (wimg_to_pte(attributes) & ~ARM_PTE_XPRR_MASK); +#else tmplate |= wimg_to_pte(attributes); +#endif #if (__ARM_VMSA__ > 7) if (tmplate & ARM_PTE_HINT_MASK) { panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx", @@ -9817,8 +11802,13 @@ pmap_create_sharedpage( kern_return_t kr; pmap_paddr_t pa = 0; +#if XNU_MONITOR + pa = pmap_alloc_page_for_kern(); + assert(pa); +#else (void) pmap_pages_alloc(&pa, PAGE_SIZE, 0); +#endif memset((char *) phystokv(pa), 0, PAGE_SIZE); @@ -9895,6 +11885,9 @@ pmap_insert_sharedpage_internal( int options = 0; VALIDATE_PMAP(pmap); +#if XNU_MONITOR + options |= PMAP_OPTIONS_NOWAIT; +#endif /* XNU_MONITOR */ #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE #error We assume a single page. 
@@ -9938,6 +11931,11 @@ pmap_insert_sharedpage_internal( kr = pmap_expand(pmap, sharedpage_vaddr, options, PMAP_TT_L2_LEVEL); if (kr != KERN_SUCCESS) { +#if XNU_MONITOR + if (kr == KERN_RESOURCE_SHORTAGE) { + return kr; + } else +#endif { panic("Failed to pmap_expand for commpage, pmap=%p", pmap); } @@ -10029,7 +12027,24 @@ void pmap_insert_sharedpage( pmap_t pmap) { +#if XNU_MONITOR + kern_return_t kr = KERN_FAILURE; + + while ((kr = pmap_insert_sharedpage_ppl(pmap)) == KERN_RESOURCE_SHORTAGE) { + pmap_alloc_page_for_ppl(); + } + + pmap_ledger_check_balance(pmap); + + if (kr != KERN_SUCCESS) { + panic("%s: failed to insert the shared page, kr=%d, " + "pmap=%p", + __FUNCTION__, kr, + pmap); + } +#else pmap_insert_sharedpage_internal(pmap); +#endif } static boolean_t @@ -10139,7 +12154,11 @@ pmap_is_empty( vm_map_offset_t va_start, vm_map_offset_t va_end) { +#if XNU_MONITOR + return pmap_is_empty_ppl(pmap, va_start, va_end); +#else return pmap_is_empty_internal(pmap, va_start, va_end); +#endif } vm_map_offset_t @@ -10265,6 +12284,124 @@ pmap_flush( return; } +#if XNU_MONITOR + +/* + * Enforce that the address range described by kva and nbytes is not currently + * PPL-owned, and won't become PPL-owned while pinned. This is to prevent + * unintentionally writing to PPL-owned memory. 
+ */ +static void +pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes) +{ + vm_offset_t end; + if (os_add_overflow(kva, nbytes, &end)) { + panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes); + } + for (vm_offset_t ckva = kva; ckva < end; ckva = round_page(ckva + 1)) { + pmap_paddr_t pa = kvtophys(ckva); + if (!pa_valid(pa)) { + panic("%s(%p): invalid physical page 0x%llx", __func__, (void*)kva, (uint64_t)pa); + } + pp_attr_t attr; + unsigned int pai = (unsigned int)pa_index(pa); + if (ckva == phystokv(pa)) { + panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa); + } + do { + attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR; + if (attr & PP_ATTR_MONITOR) { + panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa); + } + } while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai])); + } +} + +static void +pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes) +{ + vm_offset_t end; + if (os_add_overflow(kva, nbytes, &end)) { + panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes); + } + for (vm_offset_t ckva = kva; ckva < end; ckva = round_page(ckva + 1)) { + pmap_paddr_t pa = kvtophys(ckva); + if (!pa_valid(pa)) { + panic("%s(%p): invalid physical page 0x%llx", __func__, (void*)kva, (uint64_t)pa); + } + if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) { + panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa); + } + assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR)); + pa_clear_no_monitor(pa); + } +} + +/* + * Lock down a page, making all mappings read-only, and preventing + * further mappings or removal of this particular kva's mapping. + * Effectively, it makes the page at kva immutable. 
+ */ +MARK_AS_PMAP_TEXT static void +pmap_ppl_lockdown_page(vm_address_t kva) +{ + pmap_paddr_t pa = kvtophys(kva); + unsigned int pai = (unsigned int)pa_index(pa); + LOCK_PVH(pai); + pv_entry_t **pv_h = pai_to_pvh(pai); + + if (pa_test_monitor(pa)) { + panic("%#lx: page %llx belongs to PPL", kva, pa); + } + + if (pvh_get_flags(pv_h) & (PVH_FLAG_LOCKDOWN | PVH_FLAG_EXEC)) { + panic("%#lx: already locked down/executable (%#llx)", kva, pvh_get_flags(pv_h)); + } + + pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva); + + if (pte_p == PT_ENTRY_NULL) { + panic("%#lx: NULL pte", kva); + } + + pt_entry_t tmplate = *pte_p; + if ((tmplate & ARM_PTE_APMASK) != ARM_PTE_AP(AP_RWNA)) { + panic("%#lx: not a kernel r/w page (%#llx)", kva, tmplate & ARM_PTE_APMASK); + } + + pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_LOCKDOWN); + + pmap_set_ptov_ap(pai, AP_RONA, FALSE); + + UNLOCK_PVH(pai); + + pmap_page_protect_options_internal((ppnum_t)atop(pa), VM_PROT_READ, 0); +} + +/* + * Release a page from being locked down to the PPL, making it writable + * to the kernel once again. 
+ */ +MARK_AS_PMAP_TEXT static void +pmap_ppl_unlockdown_page(vm_address_t kva) +{ + pmap_paddr_t pa = kvtophys(kva); + unsigned int pai = (unsigned int)pa_index(pa); + LOCK_PVH(pai); + pv_entry_t **pv_h = pai_to_pvh(pai); + + vm_offset_t pvh_flags = pvh_get_flags(pv_h); + + if (!(pvh_flags & PVH_FLAG_LOCKDOWN)) { + panic("unlockdown attempt on not locked down virtual %#lx/pai %d", kva, pai); + } + + pvh_set_flags(pv_h, pvh_flags & ~PVH_FLAG_LOCKDOWN); + pmap_set_ptov_ap(pai, AP_RWNA, FALSE); + UNLOCK_PVH(pai); +} + +#else /* XNU_MONITOR */ static void __unused pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused) @@ -10276,6 +12413,7 @@ pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused) { } +#endif /* !XNU_MONITOR */ #define PMAP_RESIDENT_INVALID ((mach_vm_size_t)-1) @@ -10378,7 +12516,11 @@ pmap_query_resident( if (l > end) { l = end; } +#if XNU_MONITOR + resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p); +#else resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p); +#endif if (resident_bytes == PMAP_RESIDENT_INVALID) { break; } @@ -11403,7 +13545,11 @@ void pmap_set_jit_entitled( pmap_t pmap) { +#if XNU_MONITOR + pmap_set_jit_entitled_ppl(pmap); +#else pmap_set_jit_entitled_internal(pmap); +#endif } MARK_AS_PMAP_TEXT static kern_return_t @@ -11483,7 +13629,11 @@ pmap_query_page_info( vm_map_offset_t va, int *disp_p) { +#if XNU_MONITOR + return pmap_query_page_info_ppl(pmap, va, disp_p); +#else return pmap_query_page_info_internal(pmap, va, disp_p); +#endif } MARK_AS_PMAP_TEXT kern_return_t @@ -11496,7 +13646,11 @@ pmap_return_internal(__unused boolean_t do_panic, __unused boolean_t do_recurse) kern_return_t pmap_return(boolean_t do_panic, boolean_t do_recurse) { +#if XNU_MONITOR + return pmap_return_ppl(do_panic, do_recurse); +#else return pmap_return_internal(do_panic, do_recurse); +#endif } @@ -11525,7 +13679,11 @@ pmap_footprint_suspend( vm_map_t map, boolean_t suspend) 
{ +#if XNU_MONITOR + pmap_footprint_suspend_ppl(map, suspend); +#else pmap_footprint_suspend_internal(map, suspend); +#endif } #if defined(__arm64__) && (DEVELOPMENT || DEBUG) diff --git a/osfmk/arm/pmap.h b/osfmk/arm/pmap.h index e56129770..4cfec3ecf 100644 --- a/osfmk/arm/pmap.h +++ b/osfmk/arm/pmap.h @@ -79,6 +79,13 @@ #define CPUWINDOWS_MAX 4 struct pmap_cpu_data { +#if XNU_MONITOR + uint64_t cpu_id; + void * ppl_kern_saved_sp; + void * ppl_stack; + arm_context_t * save_area; + unsigned int ppl_state; +#endif #if defined(__arm64__) pmap_t cpu_nested_pmap; #else @@ -212,6 +219,9 @@ extern void set_mmu_ttb_alternate(uint64_t); extern uint64_t get_tcr(void); extern void set_tcr(uint64_t); extern uint64_t pmap_get_arm64_prot(pmap_t, vm_offset_t); +#if defined(HAS_VMSA_LOCK) +extern void vmsa_lock(void); +#endif #else extern uint32_t get_mmu_control(void); extern void set_mmu_control(uint32_t); @@ -393,6 +403,10 @@ extern void pmap_gc(void); #if defined(__arm64__) extern vm_offset_t pmap_extract(pmap_t pmap, vm_map_offset_t va); #endif +#if HAS_APPLE_PAC && XNU_MONITOR +extern void * pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t data); +extern void * pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t data); +#endif /* HAS_APPLE_PAC && XNU_MONITOR */ /* * Interfaces implemented as macros. @@ -538,6 +552,10 @@ boolean_t pmap_enforces_execute_only(pmap_t pmap); #define PMAP_LEDGER_ALLOC_INDEX 66 #define PMAP_LEDGER_FREE_INDEX 67 +#if HAS_APPLE_PAC && XNU_MONITOR +#define PMAP_SIGN_USER_PTR 68 +#define PMAP_AUTH_USER_PTR 69 +#endif /* HAS_APPLE_PAC && XNU_MONITOR */ #define PMAP_COUNT 71 @@ -554,23 +572,82 @@ extern void pmap_cpu_data_init(void); /* Get the pmap per-CPU data for the current CPU. */ extern pmap_cpu_data_t * pmap_get_cpu_data(void); +#if XNU_MONITOR +extern boolean_t pmap_ppl_locked_down; + +/* + * Denotes the bounds of the PPL stacks. These are visible so that other code + * can check if addresses are part of the PPL stacks. 
+ */ +extern void * pmap_stacks_start; +extern void * pmap_stacks_end; + +/* Asks if a page belongs to the monitor. */ +extern boolean_t pmap_is_monitor(ppnum_t pn); + +/* + * Indicates that we are done with our static bootstrap + * allocations, so the monitor may now mark the pages + * that it owns. + */ +extern void pmap_static_allocations_done(void); + +/* + * Indicates that we are done mutating sensitive state in the system, and that + * the PPL may now restrict write access to PPL owned mappings. + */ +extern void pmap_lockdown_ppl(void); + + +#ifdef KASAN +#define PPL_STACK_SIZE (PAGE_SIZE << 2) +#else +#define PPL_STACK_SIZE PAGE_SIZE +#endif + +/* One stack for each CPU, plus a guard page below each stack and above the last stack */ +#define PPL_STACK_REGION_SIZE ((MAX_CPUS * (PPL_STACK_SIZE + ARM_PGBYTES)) + ARM_PGBYTES) + +#define PPL_DATA_SEGMENT_SECTION_NAME "__PPLDATA,__data" +#define PPL_TEXT_SEGMENT_SECTION_NAME "__PPLTEXT,__text,regular,pure_instructions" +#define PPL_DATACONST_SEGMENT_SECTION_NAME "__PPLDATA,__const" + +#define MARK_AS_PMAP_DATA \ + __PLACE_IN_SECTION(PPL_DATA_SEGMENT_SECTION_NAME) +#define MARK_AS_PMAP_TEXT \ + __attribute__((used, section(PPL_TEXT_SEGMENT_SECTION_NAME), noinline)) +#define MARK_AS_PMAP_RODATA \ + __PLACE_IN_SECTION(PPL_DATACONST_SEGMENT_SECTION_NAME) + +#else /* XNU_MONITOR */ #define MARK_AS_PMAP_TEXT #define MARK_AS_PMAP_DATA #define MARK_AS_PMAP_RODATA +#endif /* !XNU_MONITOR */ extern kern_return_t pmap_return(boolean_t do_panic, boolean_t do_recurse); extern lck_grp_t pmap_lck_grp; +#if XNU_MONITOR +extern void CleanPoC_DcacheRegion_Force_nopreempt(vm_offset_t va, unsigned length); +#define pmap_force_dcache_clean(va, sz) CleanPoC_DcacheRegion_Force_nopreempt(va, sz) +#define pmap_simple_lock(l) simple_lock_nopreempt(l, &pmap_lck_grp) +#define pmap_simple_unlock(l) simple_unlock_nopreempt(l) +#define pmap_simple_lock_try(l) simple_lock_try_nopreempt(l, &pmap_lck_grp) +#define pmap_lock_bit(l, i)
hw_lock_bit_nopreempt(l, i, &pmap_lck_grp) +#define pmap_unlock_bit(l, i) hw_unlock_bit_nopreempt(l, i) +#else #define pmap_force_dcache_clean(va, sz) CleanPoC_DcacheRegion_Force(va, sz) #define pmap_simple_lock(l) simple_lock(l, &pmap_lck_grp) #define pmap_simple_unlock(l) simple_unlock(l) #define pmap_simple_lock_try(l) simple_lock_try(l, &pmap_lck_grp) #define pmap_lock_bit(l, i) hw_lock_bit(l, i, &pmap_lck_grp) #define pmap_unlock_bit(l, i) hw_unlock_bit(l, i) +#endif #endif /* #ifndef ASSEMBLER */ diff --git a/osfmk/arm/proc_reg.h b/osfmk/arm/proc_reg.h index 192bc9d69..c5921cede 100644 --- a/osfmk/arm/proc_reg.h +++ b/osfmk/arm/proc_reg.h @@ -164,6 +164,50 @@ #define __PLATFORM_WKDM_ALIGNMENT_BOUNDARY__ (64) #define __ARM_CLUSTER_COUNT__ 2 +#elif defined (APPLEVORTEX) +#define __ARM_ARCH__ 8 +#define __ARM_VMSA__ 8 +#define __ARM_SMP__ 1 +#define __ARM_VFP__ 4 +#define __ARM_COHERENT_CACHE__ 1 +#define __ARM_COHERENT_IO__ 1 +#define __ARM_IC_NOALIAS_ICACHE__ 1 +#define __ARM_DEBUG__ 7 +#define __ARM_ENABLE_SWAP__ 1 +#define __ARM_V8_CRYPTO_EXTENSIONS__ 1 +#define __ARM_16K_PG__ 1 +#define __ARM64_PMAP_SUBPAGE_L1__ 1 +#define __ARM_GLOBAL_SLEEP_BIT__ 1 +#define __ARM_PAN_AVAILABLE__ 1 +#define __ARM_WKDM_ISA_AVAILABLE__ 1 +#define __PLATFORM_WKDM_ALIGNMENT_MASK__ (0x3FULL) +#define __PLATFORM_WKDM_ALIGNMENT_BOUNDARY__ (64) +#define __ARM_CLUSTER_COUNT__ 2 + +#elif defined (APPLELIGHTNING) +#define __ARM_ARCH__ 8 +#define __ARM_VMSA__ 8 +#define __ARM_SMP__ 1 +#define __ARM_AMP__ 1 +#define __ARM_VFP__ 4 +#define __ARM_COHERENT_CACHE__ 1 +#define __ARM_COHERENT_IO__ 1 +#define __ARM_IC_NOALIAS_ICACHE__ 1 +#define __ARM_DEBUG__ 7 +#define __ARM_ENABLE_SWAP__ 1 +#define __ARM_V8_CRYPTO_EXTENSIONS__ 1 +#define __ARM_16K_PG__ 1 +#define __ARM64_PMAP_SUBPAGE_L1__ 1 +#define __ARM_GLOBAL_SLEEP_BIT__ 1 +#define __ARM_PAN_AVAILABLE__ 1 +#define __ARM_WKDM_ISA_AVAILABLE__ 1 +#define __PLATFORM_WKDM_ALIGNMENT_MASK__ (0x3FULL) +#define __PLATFORM_WKDM_ALIGNMENT_BOUNDARY__ 
(64) +#define __ARM_CLUSTER_COUNT__ 2 +#define +#define __APCFG_SUPPORTED__ 1 +#define __ARM_RANGE_TLBI__ 1 + #elif defined (BCM2837) #define __ARM_ARCH__ 8 #define __ARM_VMSA__ 8 @@ -506,6 +550,72 @@ #define L2_SWAY (L2_CSIZE - L2_NWAY) /* set size 1< ARM_KERNEL static_assert((((~ARM_KERNEL_PROTECT_EXCEPTION_START) + 1) * 2ULL) <= (ARM_TT_ROOT_SIZE + ARM_TT_ROOT_INDEX_MASK)); #endif /* __ARM_KERNEL_PROTECT__ */ +#if __APRR_SUPPORTED__ && XNU_MONITOR +/* + * If APRR is supported, setting XN on L1/L2 table entries will shift the effective + * APRR index of L3 PTEs covering PPL-protected pages in the kernel dynamic region + * from PPL R/W to kernel R/W. That will effectively remove PPL write protection + * from those pages. Avoid setting XN at the table level for MONITOR-enabled builds + * that are backed by APRR. + */ +#define ARM_DYNAMIC_TABLE_XN ARM_TTE_TABLE_PXN +#else #define ARM_DYNAMIC_TABLE_XN (ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN) +#endif #if KASAN extern vm_offset_t shadow_pbase; @@ -194,6 +205,18 @@ SECURITY_READ_ONLY_LATE(vm_offset_t) segLOWEST; SECURITY_READ_ONLY_LATE(static vm_offset_t) segTEXTB; SECURITY_READ_ONLY_LATE(static unsigned long) segSizeTEXT; +#if XNU_MONITOR +SECURITY_READ_ONLY_LATE(vm_offset_t) segPPLTEXTB; +SECURITY_READ_ONLY_LATE(unsigned long) segSizePPLTEXT; + +SECURITY_READ_ONLY_LATE(vm_offset_t) segPPLTRAMPB; +SECURITY_READ_ONLY_LATE(unsigned long) segSizePPLTRAMP; + +SECURITY_READ_ONLY_LATE(vm_offset_t) segPPLDATACONSTB; +SECURITY_READ_ONLY_LATE(unsigned long) segSizePPLDATACONST; +SECURITY_READ_ONLY_LATE(void *) pmap_stacks_start = NULL; +SECURITY_READ_ONLY_LATE(void *) pmap_stacks_end = NULL; +#endif SECURITY_READ_ONLY_LATE(static vm_offset_t) segDATACONSTB; SECURITY_READ_ONLY_LATE(static unsigned long) segSizeDATACONST; @@ -204,6 +227,10 @@ SECURITY_READ_ONLY_LATE(static unsigned long) segSizeTEXTEXEC; SECURITY_READ_ONLY_LATE(static vm_offset_t) segDATAB; SECURITY_READ_ONLY_LATE(static unsigned long) segSizeDATA; +#if XNU_MONITOR 
+SECURITY_READ_ONLY_LATE(vm_offset_t) segPPLDATAB; +SECURITY_READ_ONLY_LATE(unsigned long) segSizePPLDATA; +#endif SECURITY_READ_ONLY_LATE(vm_offset_t) segBOOTDATAB; SECURITY_READ_ONLY_LATE(unsigned long) segSizeBOOTDATA; @@ -351,7 +378,7 @@ round_up_pte_hint_address(vm_offset_t address) vm_offset_t alloc_ptpage(boolean_t map_static) { vm_offset_t vaddr; -#if !(defined(KERNEL_INTEGRITY_KTRR)) +#if !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) map_static = FALSE; #endif @@ -480,7 +507,7 @@ void dump_kva_space() { #endif /* DEBUG */ -#if __ARM_KERNEL_PROTECT__ +#if __ARM_KERNEL_PROTECT__ || XNU_MONITOR /* * arm_vm_map: * root_ttp: The kernel virtual address for the root of the target page tables @@ -555,7 +582,7 @@ arm_vm_map(tt_entry_t * root_ttp, vm_offset_t vaddr, pt_entry_t pte) *ptep = pte; } -#endif // __ARM_KERNEL_PROTECT +#endif // __ARM_KERNEL_PROTECT || XNU_MONITOR #if __ARM_KERNEL_PROTECT__ @@ -712,7 +739,7 @@ arm_vm_expand_kernel_el0_mappings(void) } #endif /* __ARM_KERNEL_PROTECT__ */ -#if defined(KERNEL_INTEGRITY_KTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) extern void bootstrap_instructions; /* @@ -777,7 +804,7 @@ static void arm_replace_identity_map(boot_args * args) ARM_PTE_AP(AP_RONA) | ARM_PTE_NX; } -#endif /* defined(KERNEL_INTEGRITY_KTRR)*/ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ tt_entry_t *arm_kva_to_tte(vm_offset_t); @@ -791,6 +818,16 @@ arm_kva_to_tte(vm_offset_t va) return tte2; } +#if XNU_MONITOR + +static inline pt_entry_t * +arm_kva_to_pte(vm_offset_t va) +{ + tt_entry_t *tte2 = arm_kva_to_tte(va); + return L3_TABLE_VA(tte2) + L3_TABLE_INDEX(va); +} + +#endif #define ARM64_GRANULE_ALLOW_BLOCK (1 << 0) #define ARM64_GRANULE_ALLOW_HINT (1 << 1) @@ -1096,13 +1133,34 @@ arm_vm_prot_init(boot_args * args) * NO, stuff in this segment gets modified during startup (viz. 
mac_policy_init()/mac_policy_list) * Make RNX in prot_finalize */ +#if XNU_MONITOR + /* The ropagetable region will ultimately be owned by the PPL. Set permissions + * on it separately to avoid applying mismatched block settings between this function, + * pmap_static_allocations_done(), and arm_vm_prot_finalize(). */ + vm_offset_t segDATACONSTE = segDATACONSTB + segSizeDATACONST; + + arm_vm_page_granular_RWNX(segDATACONSTB, (vm_offset_t)&ropagetable_begin - segDATACONSTB, ARM64_GRANULE_ALLOW_BLOCK); + arm_vm_page_granular_RWNX((vm_offset_t)&ropagetable_begin, + (vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin, ARM64_GRANULE_ALLOW_BLOCK); + arm_vm_page_granular_RWNX((vm_offset_t)&ropagetable_end, + segDATACONSTE - (vm_offset_t)&ropagetable_end, ARM64_GRANULE_ALLOW_BLOCK); +#else arm_vm_page_granular_RWNX(segDATACONSTB, segSizeDATACONST, ARM64_GRANULE_ALLOW_BLOCK); +#endif arm_vm_page_granular_ROX(segTEXTEXECB, segSizeTEXTEXEC, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); +#if XNU_MONITOR + arm_vm_page_granular_ROX(segPPLTEXTB, segSizePPLTEXT, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); + arm_vm_page_granular_ROX(segPPLTRAMPB, segSizePPLTRAMP, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); + arm_vm_page_granular_RNX(segPPLDATACONSTB, segSizePPLDATACONST, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); +#endif /* DATA segment will remain RWNX */ arm_vm_page_granular_RWNX(segDATAB, segSizeDATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); +#if XNU_MONITOR + arm_vm_page_granular_RWNX(segPPLDATAB, segSizePPLDATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); +#endif arm_vm_page_granular_RWNX(segBOOTDATAB, segSizeBOOTDATA, 0); arm_vm_page_granular_RNX((vm_offset_t)&intstack_low_guard, PAGE_MAX_SIZE, 0); @@ -1179,6 +1237,67 @@ arm_vm_physmap_slide(ptov_table_entry *temp_ptov_table, vm_map_address_t physmap ++ptov_index; } +#if XNU_MONITOR + +SECURITY_READ_ONLY_LATE(static boolean_t) keep_linkedit = 
FALSE; + +static void +arm_vm_physmap_init(boot_args *args, vm_map_address_t physmap_base, vm_map_address_t dynamic_memory_begin __unused) +{ + ptov_table_entry temp_ptov_table[PTOV_TABLE_SIZE]; + bzero(temp_ptov_table, sizeof(temp_ptov_table)); + + // This is memory that will either be handed back to the VM layer via ml_static_mfree(), + // or will be available for general-purpose use. Physical aperture mappings for this memory + // must be at page granularity, so that PPL ownership or cache attribute changes can be reflected + // in the physical aperture mappings. + + + // Slid region between gPhysBase and beginning of protected text + arm_vm_physmap_slide(temp_ptov_table, physmap_base, gVirtBase, segLOWEST - gVirtBase, AP_RWNA, 0); + + // kext bootstrap segment + arm_vm_physmap_slide(temp_ptov_table, physmap_base, segKLDB, segSizeKLD, AP_RONA, 0); + + // Early-boot data + arm_vm_physmap_slide(temp_ptov_table, physmap_base, segBOOTDATAB, segSizeBOOTDATA, AP_RONA, 0); + +#if KASAN_DYNAMIC_BLACKLIST + /* KASAN's dynamic blacklist needs to query the LINKEDIT segment at runtime. As such, the + * kext bootstrap code will not jettison LINKEDIT on kasan kernels, so don't bother to relocate it. 
*/ + keep_linkedit = TRUE; +#else + PE_parse_boot_argn("keepsyms", &keep_linkedit, sizeof(keep_linkedit)); +#endif + if (!keep_linkedit) { + // Kernel LINKEDIT + arm_vm_physmap_slide(temp_ptov_table, physmap_base, segLINKB, segSizeLINK, AP_RWNA, 0); + + // Prelinked kernel LINKEDIT + arm_vm_physmap_slide(temp_ptov_table, physmap_base, segPLKLINKEDITB, segSizePLKLINKEDIT, AP_RWNA, 0); + } + + // Prelinked kernel plists + arm_vm_physmap_slide(temp_ptov_table, physmap_base, segPRELINKINFOB, segSizePRELINKINFO, AP_RWNA, 0); + + // Device tree, ramdisk, boot args + arm_vm_physmap_slide(temp_ptov_table, physmap_base, end_kern, (args->topOfKernelData - gPhysBase + gVirtBase) - end_kern, AP_RWNA, 0); + PE_slide_devicetree(temp_ptov_table[ptov_index - 1].va - end_kern); + + // Remainder of physical memory + arm_vm_physmap_slide(temp_ptov_table, physmap_base, (args->topOfKernelData + BOOTSTRAP_TABLE_SIZE - gPhysBase + gVirtBase), + real_avail_end - (args->topOfKernelData + BOOTSTRAP_TABLE_SIZE), AP_RWNA, 0); + + assert((temp_ptov_table[ptov_index - 1].va + temp_ptov_table[ptov_index - 1].len) <= dynamic_memory_begin); + + // Sort in descending order of segment length. LUT traversal is linear, so largest (most likely used) + // segments should be placed earliest in the table to optimize lookup performance. 
+ qsort(temp_ptov_table, PTOV_TABLE_SIZE, sizeof(temp_ptov_table[0]), cmp_ptov_entries); + + memcpy(ptov_table, temp_ptov_table, sizeof(ptov_table)); +} + +#else static void arm_vm_physmap_init(boot_args *args, vm_map_address_t physmap_base, vm_map_address_t dynamic_memory_begin __unused) @@ -1205,6 +1324,7 @@ arm_vm_physmap_init(boot_args *args, vm_map_address_t physmap_base, vm_map_addre memcpy(ptov_table, temp_ptov_table, sizeof(ptov_table)); } +#endif // XNU_MONITOR void arm_vm_prot_finalize(boot_args * args __unused) @@ -1248,8 +1368,35 @@ arm_vm_prot_finalize(boot_args * args __unused) arm_vm_populate_kernel_el0_mappings(); #endif /* __ARM_KERNEL_PROTECT__ */ +#if XNU_MONITOR + for (vm_offset_t va = segKLDB; va < (segKLDB + segSizeKLD); va += ARM_PGBYTES) { + pt_entry_t *pte = arm_kva_to_pte(va); + *pte = ARM_PTE_EMPTY; + } + /* Clear the original stack mappings; these pages should be mapped through ptov_table. */ + for (vm_offset_t va = segBOOTDATAB; va < (segBOOTDATAB + segSizeBOOTDATA); va += ARM_PGBYTES) { + pt_entry_t *pte = arm_kva_to_pte(va); + *pte = ARM_PTE_EMPTY; + } + /* Clear the original PRELINKINFO mapping. This segment should be jettisoned during I/O Kit + * initialization before we reach this point. */ + for (vm_offset_t va = segPRELINKINFOB; va < (segPRELINKINFOB + segSizePRELINKINFO); va += ARM_PGBYTES) { + pt_entry_t *pte = arm_kva_to_pte(va); + *pte = ARM_PTE_EMPTY; + } + if (!keep_linkedit) { + for (vm_offset_t va = segLINKB; va < (segLINKB + segSizeLINK); va += ARM_PGBYTES) { + pt_entry_t *pte = arm_kva_to_pte(va); + *pte = ARM_PTE_EMPTY; + } + for (vm_offset_t va = segPLKLINKEDITB; va < (segPLKLINKEDITB + segSizePLKLINKEDIT); va += ARM_PGBYTES) { + pt_entry_t *pte = arm_kva_to_pte(va); + *pte = ARM_PTE_EMPTY; + } + } +#endif /* XNU_MONITOR */ -#if defined(KERNEL_INTEGRITY_KTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) /* * __LAST,__pinst should no longer be executable. 
*/ @@ -1262,7 +1409,20 @@ arm_vm_prot_finalize(boot_args * args __unused) */ #endif +#if XNU_MONITOR + vm_offset_t segDATACONSTE = segDATACONSTB + segSizeDATACONST; + + /* + * For the moment, the RO pagetable allocation is part of the + * constant data segment, but it is technically owned by the + * PPL. Hence, we should not reprotect it. + */ + arm_vm_page_granular_RNX(segDATACONSTB, (vm_offset_t)&ropagetable_begin - segDATACONSTB, ARM64_GRANULE_ALLOW_BLOCK); + arm_vm_page_granular_RNX((vm_offset_t)&ropagetable_end, + segDATACONSTE - (vm_offset_t)&ropagetable_end, ARM64_GRANULE_ALLOW_BLOCK); +#else arm_vm_page_granular_RNX(segDATACONSTB, segSizeDATACONST, ARM64_GRANULE_ALLOW_BLOCK); +#endif __builtin_arm_dsb(DSB_ISH); flush_mmu_tlb(); @@ -1363,12 +1523,22 @@ arm_vm_init(uint64_t memory_size, boot_args * args) physmap_base += physmap_slide; +#if XNU_MONITOR + physmap_base = ROUND_TWIG(physmap_base); + static_memory_end = physmap_base + mem_size; +#else static_memory_end = physmap_base + mem_size + (PTOV_TABLE_SIZE * ARM_TT_TWIG_SIZE); // worst possible case for block alignment +#endif #if KASAN /* add the KASAN stolen memory to the physmap */ dynamic_memory_begin = static_memory_end + (shadow_ptop - shadow_pbase); #else dynamic_memory_begin = static_memory_end; +#endif +#if XNU_MONITOR + pmap_stacks_start = (void*)dynamic_memory_begin; + dynamic_memory_begin += PPL_STACK_REGION_SIZE; + pmap_stacks_end = (void*)dynamic_memory_begin; #endif if (dynamic_memory_begin > VM_MAX_KERNEL_ADDRESS) panic("Unsupported memory configuration %lx\n", mem_size); @@ -1394,7 +1564,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) */ avail_start = boot_ttep + BOOTSTRAP_TABLE_SIZE; -#if defined(KERNEL_INTEGRITY_KTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) arm_replace_identity_map(args); #endif @@ -1460,7 +1630,15 @@ arm_vm_init(uint64_t memory_size, boot_args * args) segTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__TEXT", 
&segSizeTEXT); segDATACONSTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__DATA_CONST", &segSizeDATACONST); segTEXTEXECB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__TEXT_EXEC", &segSizeTEXTEXEC); +#if XNU_MONITOR + segPPLTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PPLTEXT", &segSizePPLTEXT); + segPPLTRAMPB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PPLTRAMP", &segSizePPLTRAMP); + segPPLDATACONSTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PPLDATA_CONST", &segSizePPLDATACONST); +#endif segDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__DATA", &segSizeDATA); +#if XNU_MONITOR + segPPLDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PPLDATA", &segSizePPLDATA); +#endif segBOOTDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__BOOTDATA", &segSizeBOOTDATA); segLINKB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LINKEDIT", &segSizeLINK); @@ -1566,6 +1744,9 @@ arm_vm_init(uint64_t memory_size, boot_args * args) set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK); flush_mmu_tlb(); +#if defined(HAS_VMSA_LOCK) + vmsa_lock(); +#endif kva_active = TRUE; // global table pointers may need to be different due to physical aperture remapping cpu_tte = (tt_entry_t*)(phystokv(cpu_ttep)); @@ -1582,6 +1763,28 @@ arm_vm_init(uint64_t memory_size, boot_args * args) vm_kernel_etext = segTEXTB + segSizeTEXT + segSizeDATACONST + segSizeTEXTEXEC; dynamic_memory_begin = ROUND_TWIG(dynamic_memory_begin); +#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) + // reserve a 32MB region without permission overrides to use later for a CTRR unit test + { + extern vm_offset_t ctrr_test_page; + tt_entry_t *new_tte; + + ctrr_test_page = dynamic_memory_begin; + dynamic_memory_begin += ARM_TT_L2_SIZE; + cpu_l1_tte = cpu_tte + ((ctrr_test_page & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); + assert((*cpu_l1_tte) & ARM_TTE_VALID); + cpu_l2_tte = 
((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((ctrr_test_page & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); + assert((*cpu_l2_tte) == ARM_TTE_EMPTY); + new_tte = (tt_entry_t *)alloc_ptpage(FALSE); + bzero(new_tte, ARM_PGBYTES); + *cpu_l2_tte = (kvtophys((vm_offset_t)new_tte) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID; + } +#endif /* defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) */ +#if XNU_MONITOR + for (vm_offset_t cur = (vm_offset_t)pmap_stacks_start; cur < (vm_offset_t)pmap_stacks_end; cur += ARM_PGBYTES) { + arm_vm_map(cpu_tte, cur, ARM_PTE_EMPTY); + } +#endif pmap_bootstrap(dynamic_memory_begin); disable_preemption(); @@ -1708,6 +1911,9 @@ arm_vm_init(uint64_t memory_size, boot_args * args) */ avail_start = (avail_start + PAGE_MASK) & ~PAGE_MASK; +#if XNU_MONITOR + pmap_static_allocations_done(); +#endif first_avail = avail_start; patch_low_glo_static_region(args->topOfKernelData, avail_start - args->topOfKernelData); enable_preemption(); diff --git a/osfmk/arm64/cpu.c b/osfmk/arm64/cpu.c index 2360e6982..1ab9c9f1d 100644 --- a/osfmk/arm64/cpu.c +++ b/osfmk/arm64/cpu.c @@ -90,6 +90,9 @@ extern void typhoon_prepare_for_wfi(void); extern void typhoon_return_from_wfi(void); #endif +#if HAS_RETENTION_STATE +extern void arm64_retention_wfi(void); +#endif vm_address_t start_cpu_paddr; @@ -402,7 +405,11 @@ cpu_idle(void) typhoon_prepare_for_wfi(); #endif __builtin_arm_dsb(DSB_SY); +#if HAS_RETENTION_STATE + arm64_retention_wfi(); +#else __builtin_arm_wfi(); +#endif #if defined(APPLETYPHOON) // CPU1 Stuck in WFIWT Because of MMU Prefetch @@ -646,6 +653,7 @@ cpu_data_init(cpu_data_t *cpu_data_ptr) cpu_data_ptr->coresight_base[i] = 0; } +#if !XNU_MONITOR pmap_cpu_data_t * pmap_cpu_data_ptr = &cpu_data_ptr->cpu_pmap_cpu_data; pmap_cpu_data_ptr->cpu_nested_pmap = (struct pmap *) NULL; @@ -654,6 +662,7 @@ cpu_data_init(cpu_data_t *cpu_data_ptr) for (i = 0; i < (sizeof(pmap_cpu_data_ptr->cpu_asid_high_bits) / 
sizeof(*pmap_cpu_data_ptr->cpu_asid_high_bits)); i++) { pmap_cpu_data_ptr->cpu_asid_high_bits[i] = 0; } +#endif cpu_data_ptr->halt_status = CPU_NOT_HALTED; #if __ARM_KERNEL_PROTECT__ cpu_data_ptr->cpu_exc_vectors = (vm_offset_t)&exc_vectors_table; @@ -681,6 +690,20 @@ cpu_data_register(cpu_data_t *cpu_data_ptr) return KERN_SUCCESS; } +#if defined(KERNEL_INTEGRITY_CTRR) + +lck_spin_t ctrr_cpu_start_lck; +bool ctrr_cluster_locked[__ARM_CLUSTER_COUNT__]; + +void +init_ctrr_cpu_start_lock(void) +{ + lck_grp_t *ctrr_cpu_start_lock_grp = lck_grp_alloc_init("ctrr_cpu_start_lock", 0); + assert(ctrr_cpu_start_lock_grp); + lck_spin_init(&ctrr_cpu_start_lck, ctrr_cpu_start_lock_grp, NULL); +} + +#endif kern_return_t cpu_start(int cpu) @@ -697,7 +720,9 @@ cpu_start(int cpu) cpu_data_ptr->cpu_reset_handler = (vm_offset_t) start_cpu_paddr; +#if !XNU_MONITOR cpu_data_ptr->cpu_pmap_cpu_data.cpu_nested_pmap = NULL; +#endif if (cpu_data_ptr->cpu_processor->startup_thread != THREAD_NULL) { first_thread = cpu_data_ptr->cpu_processor->startup_thread; @@ -711,6 +736,22 @@ cpu_start(int cpu) flush_dcache((vm_offset_t)&CpuDataEntries[cpu], sizeof(cpu_data_entry_t), FALSE); flush_dcache((vm_offset_t)cpu_data_ptr, sizeof(cpu_data_t), FALSE); +#if defined(KERNEL_INTEGRITY_CTRR) + /* first time CPU starts, if not cluster master, and if cluster is not already locked, + * block until cluster becomes locked. 
*/ + if (cpu_data_ptr->cpu_processor->active_thread == THREAD_NULL + && !cpu_data_ptr->cluster_master) { + lck_spin_lock(&ctrr_cpu_start_lck); + if (ctrr_cluster_locked[cpu_data_ptr->cpu_cluster_id] == 0) { + assert_wait(&ctrr_cluster_locked[cpu_data_ptr->cpu_cluster_id], THREAD_UNINT); + lck_spin_unlock(&ctrr_cpu_start_lck); + thread_block(THREAD_CONTINUE_NULL); + assert(ctrr_cluster_locked[cpu_data_ptr->cpu_cluster_id] == 1); + } else { + lck_spin_unlock(&ctrr_cpu_start_lck); + } + } +#endif (void) PE_cpu_start(cpu_data_ptr->cpu_id, (vm_offset_t)NULL, (vm_offset_t)NULL); } diff --git a/osfmk/arm64/exception_asm.h b/osfmk/arm64/exception_asm.h index 41bfa1f68..60aa8b83f 100644 --- a/osfmk/arm64/exception_asm.h +++ b/osfmk/arm64/exception_asm.h @@ -30,6 +30,71 @@ #include #endif +#if XNU_MONITOR +/* Exit path defines; for controlling PPL -> kernel transitions. */ +#define PPL_EXIT_DISPATCH 0 /* This is a clean exit after a PPL request. */ +#define PPL_EXIT_PANIC_CALL 1 /* The PPL has called panic. */ +#define PPL_EXIT_BAD_CALL 2 /* The PPL request failed. */ +#define PPL_EXIT_EXCEPTION 3 /* The PPL took an exception. */ + +#define KERNEL_MODE_ELR ELR_GL11 +#define KERNEL_MODE_FAR FAR_GL11 +#define KERNEL_MODE_ESR ESR_GL11 +#define KERNEL_MODE_SPSR SPSR_GL11 +#define KERNEL_MODE_ASPSR ASPSR_GL11 +#define KERNEL_MODE_VBAR VBAR_GL11 +#define KERNEL_MODE_TPIDR TPIDR_GL11 + +#define GUARDED_MODE_ELR ELR_EL1 +#define GUARDED_MODE_FAR FAR_EL1 +#define GUARDED_MODE_ESR ESR_EL1 +#define GUARDED_MODE_SPSR SPSR_EL1 +#define GUARDED_MODE_ASPSR ASPSR_EL1 +#define GUARDED_MODE_VBAR VBAR_EL1 +#define GUARDED_MODE_TPIDR TPIDR_EL1 + +/* + * GET_PMAP_CPU_DATA + * + * Retrieves the PPL per-CPU data for the current CPU. + * arg0 - Address of the PPL per-CPU data is returned through this + * arg1 - Scratch register + * arg2 - Scratch register + * + */ +.macro GET_PMAP_CPU_DATA +/* Get the CPU ID. 
*/ +mrs $0, MPIDR_EL1 +#ifdef CPU_CLUSTER_OFFSETS +ubfx $1, $0, MPIDR_AFF1_SHIFT, MPIDR_AFF1_WIDTH +cmp $1, __ARM_CLUSTER_COUNT__ +b.hs . +adrp $2, EXT(pmap_cluster_offsets)@page +add $2, $2, EXT(pmap_cluster_offsets)@pageoff +ldr $1, [$2, $1, lsl #3] +and $0, $0, MPIDR_AFF0_MASK +add $0, $0, $1 +#else +and $0, $0, MPIDR_AFF0_MASK +#endif + +/* Get the PPL CPU data array. */ +adrp $1, EXT(pmap_cpu_data_array)@page +add $1, $1, EXT(pmap_cpu_data_array)@pageoff + +/* + * Sanity check the CPU ID (this is not a panic because this pertains to + * the hardware configuration; this should only fail if our + * understanding of the hardware is incorrect). + */ +cmp $0, MAX_CPUS +b.hs . + +mov $2, PMAP_CPU_DATA_ARRAY_ENTRY_SIZE +/* Get the PPL per-CPU data. */ +madd $0, $0, $2, $1 +.endmacro +#endif /* XNU_MONITOR */ /* * INIT_SAVED_STATE_FLAVORS diff --git a/osfmk/arm64/genassym.c b/osfmk/arm64/genassym.c index c47c6ab1a..511460bdc 100644 --- a/osfmk/arm64/genassym.c +++ b/osfmk/arm64/genassym.c @@ -257,6 +257,14 @@ main(int argc, DECLARE("SR_RESTORE_TCR_EL1", offsetof(struct sysreg_restore, tcr_el1)); +#if XNU_MONITOR + DECLARE("PMAP_CPU_DATA_PPL_STATE", offsetof(struct pmap_cpu_data, ppl_state)); + DECLARE("PMAP_CPU_DATA_ARRAY_ENTRY_SIZE", sizeof(struct pmap_cpu_data_array_entry)); + DECLARE("PMAP_CPU_DATA_PPL_STACK", offsetof(struct pmap_cpu_data, ppl_stack)); + DECLARE("PMAP_CPU_DATA_KERN_SAVED_SP", offsetof(struct pmap_cpu_data, ppl_kern_saved_sp)); + DECLARE("PMAP_CPU_DATA_SAVE_AREA", offsetof(struct pmap_cpu_data, save_area)); + DECLARE("PMAP_COUNT", PMAP_COUNT); +#endif /* XNU_MONITOR */ #if defined(HAS_APPLE_PAC) diff --git a/osfmk/arm64/locore.s b/osfmk/arm64/locore.s index f9162a819..660a59f1b 100644 --- a/osfmk/arm64/locore.s +++ b/osfmk/arm64/locore.s @@ -40,6 +40,180 @@ #include #endif +#if XNU_MONITOR +/* + * CHECK_EXCEPTION_RETURN_DISPATCH_PPL + * + * Checks if an exception was taken from the PPL, and if so, trampolines back + * into the PPL. 
+ * x26 - 0 if the exception was taken while in the kernel, 1 if the + * exception was taken while in the PPL. + */ +.macro CHECK_EXCEPTION_RETURN_DISPATCH_PPL + cmp x26, xzr + b.eq 1f + + /* Return to the PPL. */ + mov x15, #0 + mov w10, #PPL_STATE_EXCEPTION +#if __APRR_SUPPORTED__ + b Ldisable_aif_and_enter_ppl +#else +#error "XPRR configuration error" +#endif /* __APRR_SUPPORTED__ */ +1: +.endmacro + +#if __APRR_SUPPORTED__ +/* + * EL1_SP0_VECTOR_PPL_CHECK + * + * Check to see if the exception was taken by the kernel or the PPL. Falls + * through if kernel, hands off to the given label if PPL. Expects to run on + * SP1. + * arg0 - Label to go to if this was a PPL exception. + */ +.macro EL1_SP0_VECTOR_PPL_CHECK + sub sp, sp, ARM_CONTEXT_SIZE + stp x0, x1, [sp, SS64_X0] + mrs x0, APRR_EL1 + MOV64 x1, APRR_EL1_DEFAULT + cmp x0, x1 + b.ne $0 + ldp x0, x1, [sp, SS64_X0] + add sp, sp, ARM_CONTEXT_SIZE +.endmacro + +#define STAY_ON_SP1 0 +#define SWITCH_TO_SP0 1 + +#define INVOKE_PREFLIGHT 0 +#define NO_INVOKE_PREFLIGHT 1 + +/* + * EL1_SP0_VECTOR_NOT_IN_KERNEL_MODE + * + * Verify whether an exception came from the PPL or from the kernel. If it came + * from the PPL, save off the PPL state and transition out of the PPL. + * arg0 - Label to go to if this was a kernel exception + * arg1 - Label to go to (after leaving the PPL) if this was a PPL exception + * arg2 - Indicates if this should switch back to SP0 + * x0 - APRR_EL1 value read by EL1_SP0_VECTOR_PPL_CHECK + */ +.macro EL1_SP0_VECTOR_NOT_IN_KERNEL_MODE + /* Spill some more registers. */ + stp x2, x3, [sp, SS64_X2] + + /* + * Check if the PPL is locked down; if not, we can treat this as a + * kernel exception. + */ + adrp x1, EXT(pmap_ppl_locked_down)@page + ldr w1, [x1, #EXT(pmap_ppl_locked_down)@pageoff] + cbz x1, 2f + + /* Ensure that APRR_EL1 is actually in PPL mode. */ + MOV64 x1, APRR_EL1_PPL + cmp x0, x1 + b.ne . + + /* + * Check if the CPU is in the PPL; if not we can treat this as a + * kernel exception. 
+ */ + GET_PMAP_CPU_DATA x3, x1, x2 + ldr w1, [x3, PMAP_CPU_DATA_PPL_STATE] + cmp x1, #PPL_STATE_KERNEL + b.eq 2f + + /* Ensure that the CPU is in the expected PPL state. */ + cmp x1, #PPL_STATE_DISPATCH + b.ne . + + /* Mark the CPU as dealing with an exception. */ + mov x1, #PPL_STATE_EXCEPTION + str w1, [x3, PMAP_CPU_DATA_PPL_STATE] + + /* Load the bounds of the PPL trampoline. */ + adrp x0, EXT(ppl_no_exception_start)@page + add x0, x0, EXT(ppl_no_exception_start)@pageoff + adrp x1, EXT(ppl_no_exception_end)@page + add x1, x1, EXT(ppl_no_exception_end)@pageoff + + /* + * Ensure that the exception did not occur in the trampoline. If it + * did, we are either being attacked or our state machine is + * horrifically broken. + */ + mrs x2, ELR_EL1 + cmp x2, x0 + b.lo 1f + cmp x2, x1 + b.hi 1f + + /* We might be under attack; spin. */ + b . + +1: + /* Get the PPL save area. */ + mov x1, x3 + ldr x0, [x3, PMAP_CPU_DATA_SAVE_AREA] + + /* Save our x0, x1 state. */ + ldp x2, x3, [sp, SS64_X0] + stp x2, x3, [x0, SS64_X0] + + /* Restore SP1 to its original state. */ + mov x3, sp + add sp, sp, ARM_CONTEXT_SIZE + + .if $2 == SWITCH_TO_SP0 + /* Switch back to SP0. */ + msr SPSel, #0 + mov x2, sp + .else + /* Load the SP0 value. */ + mrs x2, SP_EL0 + .endif + + /* Save off the stack pointer. */ + str x2, [x0, SS64_SP] + + INIT_SAVED_STATE_FLAVORS x0, w1, w2 + + /* Save the context that was interrupted. */ + ldp x2, x3, [x3, SS64_X2] + stp fp, lr, [x0, SS64_FP] + SPILL_REGISTERS KERNEL_MODE + + /* + * Stash the function we wish to be invoked to deal with the exception; + * usually this is some preflight function for the fleh_* handler. + */ + adrp x25, $1@page + add x25, x25, $1@pageoff + + /* + * Indicate that this is a PPL exception, and that we should return to + * the PPL. + */ + mov x26, #1 + + /* Transition back to kernel mode. */ + mov x15, #PPL_EXIT_EXCEPTION + b ppl_return_to_kernel_mode +2: + /* Restore SP1 state. 
*/ + ldp x2, x3, [sp, SS64_X2] + ldp x0, x1, [sp, SS64_X0] + add sp, sp, ARM_CONTEXT_SIZE + + /* Go to the specified label (usually the original exception vector). */ + b $0 +.endmacro +#endif /* __APRR_SUPPORTED__ */ + +#endif /* XNU_MONITOR */ #define CBF_DISABLE 0 #define CBF_ENABLE 1 @@ -239,6 +413,14 @@ Lel0_serror_vector_64: .endmacro el1_sp0_synchronous_vector_long: +#if XNU_MONITOR && __APRR_SUPPORTED__ + /* + * We do not have enough space for new instructions in this vector, so + * jump to outside code to check if this exception was taken in the PPL. + */ + b el1_sp0_synchronous_vector_ppl_check +Lel1_sp0_synchronous_vector_kernel: +#endif sub sp, sp, ARM_CONTEXT_SIZE // Make space on the exception stack stp x0, x1, [sp, SS64_X0] // Save x0, x1 to the stack mrs x1, ESR_EL1 // Get the exception syndrome @@ -261,6 +443,10 @@ Lkernel_stack_valid: b fleh_dispatch64 el1_sp0_irq_vector_long: +#if XNU_MONITOR && __APRR_SUPPORTED__ + EL1_SP0_VECTOR_PPL_CHECK el1_sp0_irq_vector_not_in_kernel_mode +Lel1_sp0_irq_vector_kernel: +#endif EL1_SP0_VECTOR mrs x1, TPIDR_EL1 ldr x1, [x1, ACT_CPUDATAP] @@ -272,6 +458,10 @@ el1_sp0_irq_vector_long: el1_sp0_fiq_vector_long: // ARM64_TODO write optimized decrementer +#if XNU_MONITOR && __APRR_SUPPORTED__ + EL1_SP0_VECTOR_PPL_CHECK el1_sp0_fiq_vector_not_in_kernel_mode +Lel1_sp0_fiq_vector_kernel: +#endif EL1_SP0_VECTOR mrs x1, TPIDR_EL1 ldr x1, [x1, ACT_CPUDATAP] @@ -282,6 +472,10 @@ el1_sp0_fiq_vector_long: b fleh_dispatch64 el1_sp0_serror_vector_long: +#if XNU_MONITOR && __APRR_SUPPORTED__ + EL1_SP0_VECTOR_PPL_CHECK el1_sp0_serror_vector_not_in_kernel_mode +Lel1_sp0_serror_vector_kernel: +#endif EL1_SP0_VECTOR adrp x1, EXT(fleh_serror)@page // Load address for fleh add x1, x1, EXT(fleh_serror)@pageoff @@ -417,6 +611,13 @@ el0_serror_vector_64_long: add x1, x1, EXT(fleh_serror)@pageoff b fleh_dispatch64 +#if XNU_MONITOR && __APRR_SUPPORTED__ +el1_sp0_synchronous_vector_ppl_check: + EL1_SP0_VECTOR_PPL_CHECK 
el1_sp0_synchronous_vector_not_in_kernel_mode + + /* Jump back to the primary exception vector if we fell through. */ + b Lel1_sp0_synchronous_vector_kernel +#endif /* * check_exception_stack @@ -525,7 +726,7 @@ check_ktrr_sctlr_trap: b.ne Lel1_sp1_synchronous_vector_continue msr ELR_EL1, lr // Return to caller eret -#endif /* defined(KERNEL_INTEGRITY_KTRR)*/ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ /* 64-bit first level exception handler dispatcher. * Completes register context saving and branches to FLEH. @@ -571,7 +772,9 @@ fleh_dispatch64: mov x23, #0 mov x24, #0 mov x25, #0 +#if !XNU_MONITOR mov x26, #0 +#endif mov x27, #0 mov x28, #0 /* fp/lr already cleared by EL0_64_VECTOR */ @@ -580,6 +783,10 @@ fleh_dispatch64: mov x21, x0 // Copy arm_context_t pointer to x21 mov x22, x1 // Copy handler routine to x22 +#if XNU_MONITOR + /* Zero x26 to indicate that this should not return to the PPL. */ + mov x26, #0 +#endif #if !CONFIG_SKIP_PRECISE_USER_KERNEL_TIME tst x23, PSR64_MODE_EL_MASK // If any EL MODE bits are set, we're coming from @@ -620,6 +827,9 @@ Lvalid_link_register: bl EXT(sleh_synchronous) POP_FRAME +#if XNU_MONITOR + CHECK_EXCEPTION_RETURN_DISPATCH_PPL +#endif b exception_return_dispatch @@ -691,6 +901,9 @@ LEXT(fleh_irq) POP_FRAME END_INTERRUPT_HANDLER +#if XNU_MONITOR + CHECK_EXCEPTION_RETURN_DISPATCH_PPL +#endif b exception_return_dispatch @@ -710,6 +923,9 @@ LEXT(fleh_fiq) POP_FRAME END_INTERRUPT_HANDLER +#if XNU_MONITOR + CHECK_EXCEPTION_RETURN_DISPATCH_PPL +#endif b exception_return_dispatch @@ -724,6 +940,9 @@ LEXT(fleh_serror) bl EXT(sleh_serror) POP_FRAME +#if XNU_MONITOR + CHECK_EXCEPTION_RETURN_DISPATCH_PPL +#endif b exception_return_dispatch @@ -1048,6 +1267,18 @@ user_take_ast: user_set_debug_state_and_return: +#if defined(APPLELIGHTNING) +/* rdar://53177964 ([Cebu Errata SW WA][v8Debug] MDR NEX L3 clock turns OFF during restoreCheckpoint due to SWStep getting masked) */ + + ARM64_IS_PCORE x12 // if 
we're not a pCORE, also do nothing + cbz x12, 1f + + mrs x12, ARM64_REG_HID1 // if any debug session ever existed, set forceNexL3ClkOn + orr x12, x12, ARM64_REG_HID1_forceNexL3ClkOn + msr ARM64_REG_HID1, x12 +1: + +#endif ldr x4, [x3, ACT_CPUDATAP] // Get current CPU data pointer isb // Synchronize context @@ -1111,11 +1342,577 @@ L_preempt_count_notzero_str: LEXT(ExceptionVectorsEnd) #endif /* __ARM_KERNEL_PROTECT__ */ +#if XNU_MONITOR +#if __APRR_SUPPORTED__ + .text + .align 2 +el1_sp0_synchronous_vector_not_in_kernel_mode: + EL1_SP0_VECTOR_NOT_IN_KERNEL_MODE Lel1_sp0_synchronous_vector_kernel, fleh_synchronous_from_ppl, STAY_ON_SP1 + + .text + .align 2 +el1_sp0_fiq_vector_not_in_kernel_mode: + EL1_SP0_VECTOR_NOT_IN_KERNEL_MODE Lel1_sp0_fiq_vector_kernel, fleh_fiq_from_ppl, SWITCH_TO_SP0 + + .text + .align 2 +el1_sp0_irq_vector_not_in_kernel_mode: + EL1_SP0_VECTOR_NOT_IN_KERNEL_MODE Lel1_sp0_irq_vector_kernel, fleh_irq_from_ppl, SWITCH_TO_SP0 + + .text + .align 2 +el1_sp0_serror_vector_not_in_kernel_mode: + EL1_SP0_VECTOR_NOT_IN_KERNEL_MODE Lel1_sp0_serror_vector_kernel, fleh_serror_from_ppl, SWITCH_TO_SP0 +#endif /* __APRR_SUPPORTED__ */ + +/* + * Functions to preflight the fleh handlers when the PPL has taken an exception; + * mostly concerned with setting up state for the normal fleh code. + */ +fleh_synchronous_from_ppl: + /* Save x0. */ + mov x15, x0 + + /* Grab the ESR. */ + mrs x1, ESR_EL1 // Get the exception syndrome + + /* If the stack pointer is corrupt, it will manifest either as a data abort + * (syndrome 0x25) or a misaligned pointer (syndrome 0x26). We can check + * these quickly by testing bit 5 of the exception class. + */ + tbz x1, #(5 + ESR_EC_SHIFT), Lvalid_ppl_stack + mrs x0, SP_EL0 // Get SP_EL0 + + /* Perform high level checks for stack corruption. 
*/ + and x1, x1, #ESR_EC_MASK // Mask the exception class + mov x2, #(ESR_EC_SP_ALIGN << ESR_EC_SHIFT) + cmp x1, x2 // If we have a stack alignment exception + b.eq Lcorrupt_ppl_stack // ...the stack is definitely corrupted + mov x2, #(ESR_EC_DABORT_EL1 << ESR_EC_SHIFT) + cmp x1, x2 // If we have a data abort, we need to + b.ne Lvalid_ppl_stack // ...validate the stack pointer + +Ltest_pstack: + /* Bounds check the PPL stack. */ + adrp x10, EXT(pmap_stacks_start)@page + ldr x10, [x10, #EXT(pmap_stacks_start)@pageoff] + adrp x11, EXT(pmap_stacks_end)@page + ldr x11, [x11, #EXT(pmap_stacks_end)@pageoff] + cmp x0, x10 + b.lo Lcorrupt_ppl_stack + cmp x0, x11 + b.hi Lcorrupt_ppl_stack + +Lvalid_ppl_stack: + /* Restore x0. */ + mov x0, x15 + + /* Switch back to the kernel stack. */ + msr SPSel, #0 + GET_PMAP_CPU_DATA x5, x6, x7 + ldr x6, [x5, PMAP_CPU_DATA_KERN_SAVED_SP] + mov sp, x6 + + /* Hand off to the synch handler. */ + b EXT(fleh_synchronous) + +Lcorrupt_ppl_stack: + /* Restore x0. */ + mov x0, x15 + + /* Hand off to the invalid stack handler. */ + b fleh_invalid_stack + +fleh_fiq_from_ppl: + mrs x1, TPIDR_EL1 + ldr x1, [x1, ACT_CPUDATAP] + ldr x1, [x1, CPU_ISTACKPTR] + mov sp, x1 + b EXT(fleh_fiq) + +fleh_irq_from_ppl: + mrs x1, TPIDR_EL1 + ldr x1, [x1, ACT_CPUDATAP] + ldr x1, [x1, CPU_ISTACKPTR] + mov sp, x1 + b EXT(fleh_irq) + +fleh_serror_from_ppl: + GET_PMAP_CPU_DATA x5, x6, x7 + ldr x6, [x5, PMAP_CPU_DATA_KERN_SAVED_SP] + mov sp, x6 + b EXT(fleh_serror) + +/* + * REENABLE_DAIF + * + * Restores the DAIF bits to their original state (well, the AIF bits at least). + * arg0 - DAIF bits (read from the DAIF interface) to restore + */ +.macro REENABLE_DAIF + /* AIF enable. */ + tst $0, #(DAIF_IRQF | DAIF_FIQF | DAIF_ASYNCF) + b.eq 3f + + /* IF enable. */ + tst $0, #(DAIF_IRQF | DAIF_FIQF) + b.eq 2f + + /* A enable. */ + tst $0, #(DAIF_ASYNCF) + b.eq 1f + + /* Enable nothing. */ + b 4f + + /* A enable. */ +1: + msr DAIFClr, #(DAIFSC_ASYNCF) + b 4f + + /* IF enable. 
*/ +2: + msr DAIFClr, #(DAIFSC_IRQF | DAIFSC_FIQF) + b 4f + + /* AIF enable. */ +3: + msr DAIFClr, #(DAIFSC_IRQF | DAIFSC_FIQF | DAIFSC_ASYNCF) + + /* Done! */ +4: +.endmacro + + +#if XNU_MONITOR && __APRR_SUPPORTED__ +/* + * aprr_ppl_enter + * + * Invokes the PPL + * x15 - The index of the requested PPL function. + */ + .text + .align 2 + .globl EXT(aprr_ppl_enter) +LEXT(aprr_ppl_enter) + /* Push a frame. */ + ARM64_STACK_PROLOG + stp x20, x21, [sp, #-0x20]! + stp x29, x30, [sp, #0x10] + add x29, sp, #0x10 + + /* Increase the preemption count. */ + mrs x10, TPIDR_EL1 + ldr w12, [x10, ACT_PREEMPT_CNT] + add w12, w12, #1 + str w12, [x10, ACT_PREEMPT_CNT] + + /* Is the PPL currently locked down? */ + adrp x13, EXT(pmap_ppl_locked_down)@page + add x13, x13, EXT(pmap_ppl_locked_down)@pageoff + ldr w14, [x13] + cmp w14, wzr + + /* If not, just perform the call in the current context. */ + b.eq EXT(ppl_bootstrap_dispatch) + + mov w10, #PPL_STATE_KERNEL + b Ldisable_aif_and_enter_ppl + + /* We align this to land the next few instructions on their own page. */ + .section __PPLTRAMP,__text,regular,pure_instructions + .align 14 + .space (16*1024)-(4*8) // 8 insns + + /* + * This label is used by exception handlers that are trying to return + * to the PPL. + */ +Ldisable_aif_and_enter_ppl: + /* We must trampoline to the PPL context; disable AIF. */ + mrs x20, DAIF + msr DAIFSet, #(DAIFSC_ASYNCF | DAIFSC_IRQF | DAIFSC_FIQF) + + .globl EXT(ppl_no_exception_start) +LEXT(ppl_no_exception_start) + /* Switch APRR_EL1 to PPL mode. */ + MOV64 x14, APRR_EL1_PPL + msr APRR_EL1, x14 + + /* This ISB should be the last instruction on a page. */ + // TODO: can we static assert this? + isb +#endif /* XNU_MONITOR && __APRR_SUPPORTED__ */ + + + // x15: ppl call number + // w10: ppl_state + // x20: gxf_enter caller's DAIF + .globl EXT(ppl_trampoline_start) +LEXT(ppl_trampoline_start) + +#if __APRR_SUPPORTED__ + /* Squash AIF AGAIN, because someone may have attacked us. 
*/ + msr DAIFSet, #(DAIFSC_ASYNCF | DAIFSC_IRQF | DAIFSC_FIQF) +#endif /* __APRR_SUPPORTED__ */ + +#if __APRR_SUPPORTED__ + /* Verify the state of APRR_EL1. */ + MOV64 x14, APRR_EL1_PPL + mrs x21, APRR_EL1 +#else /* __APRR_SUPPORTED__ */ +#error "XPRR configuration error" +#endif /* __APRR_SUPPORTED__ */ + cmp x14, x21 + b.ne Lppl_fail_dispatch + + /* Verify the request ID. */ + cmp x15, PMAP_COUNT + b.hs Lppl_fail_dispatch + + /* Get the PPL CPU data structure. */ + GET_PMAP_CPU_DATA x12, x13, x14 + + /* Mark this CPU as being in the PPL. */ + ldr w9, [x12, PMAP_CPU_DATA_PPL_STATE] + + cmp w9, #PPL_STATE_KERNEL + b.eq Lppl_mark_cpu_as_dispatching + + /* Check to see if we are trying to trap from within the PPL. */ + cmp w9, #PPL_STATE_DISPATCH + b.eq Lppl_fail_dispatch_ppl + + + /* Ensure that we are returning from an exception. */ + cmp w9, #PPL_STATE_EXCEPTION + b.ne Lppl_fail_dispatch + + // where is w10 set? + // in CHECK_EXCEPTION_RETURN_DISPATCH_PPL + cmp w10, #PPL_STATE_EXCEPTION + b.ne Lppl_fail_dispatch + + /* This is an exception return; set the CPU to the dispatching state. */ + mov w9, #PPL_STATE_DISPATCH + str w9, [x12, PMAP_CPU_DATA_PPL_STATE] + + /* Find the save area, and return to the saved PPL context. */ + ldr x0, [x12, PMAP_CPU_DATA_SAVE_AREA] + mov sp, x0 +#if __APRR_SUPPORTED__ + b Lexception_return_restore_registers +#else + b EXT(return_to_ppl) +#endif /* __APRR_SUPPORTED__ */ + +Lppl_mark_cpu_as_dispatching: + cmp w10, #PPL_STATE_KERNEL + b.ne Lppl_fail_dispatch + + /* Mark the CPU as dispatching. */ + mov w13, #PPL_STATE_DISPATCH + str w13, [x12, PMAP_CPU_DATA_PPL_STATE] + + /* Get the handler for the request */ + adrp x9, EXT(ppl_handler_table)@page + add x9, x9, EXT(ppl_handler_table)@pageoff + ldr x10, [x9, x15, lsl #3] + + /* Switch to the regular PPL stack. 
*/ + // TODO: switch to PPL_STACK earlier in gxf_ppl_entry_handler + ldr x9, [x12, PMAP_CPU_DATA_PPL_STACK] + + // SP0 is thread stack here + mov x21, sp + // SP0 is now PPL stack + mov sp, x9 + + + /* Save the old stack pointer off in case we need it. */ + str x21, [x12, PMAP_CPU_DATA_KERN_SAVED_SP] + + /* Branch to the code that will invoke the PPL request. */ + b EXT(ppl_dispatch) + +Lppl_fail_dispatch_ppl: + /* Switch back to the kernel stack. */ + ldr x10, [x12, PMAP_CPU_DATA_KERN_SAVED_SP] + mov sp, x10 + +Lppl_fail_dispatch: + /* Indicate that we failed. */ + mov x15, #PPL_EXIT_BAD_CALL + + /* Move the DAIF bits into the expected register. */ + mov x10, x20 + + /* Return to kernel mode. */ + b ppl_return_to_kernel_mode + +Lppl_dispatch_exit: + /* Indicate that we are cleanly exiting the PPL. */ + mov x15, #PPL_EXIT_DISPATCH + + /* Switch back to the original (kernel thread) stack. */ + mov sp, x21 + + /* Move the saved DAIF bits. */ + mov x10, x20 + + /* Clear the old stack pointer. */ + str xzr, [x12, PMAP_CPU_DATA_KERN_SAVED_SP] + + /* + * Mark the CPU as no longer being in the PPL. We spin if our state + * machine is broken. + */ + ldr w9, [x12, PMAP_CPU_DATA_PPL_STATE] + cmp w9, #PPL_STATE_DISPATCH + b.ne . + mov w9, #PPL_STATE_KERNEL + str w9, [x12, PMAP_CPU_DATA_PPL_STATE] + + /* Return to the kernel. */ + b ppl_return_to_kernel_mode + +#if __APRR_SUPPORTED__ + /* We align this to land the next few instructions on their own page. */ + .align 14 + .space (16*1024)-(4*5) // 5 insns + +ppl_return_to_kernel_mode: + /* Switch APRR_EL1 back to the kernel mode. */ + // must be 5 instructions + MOV64 x14, APRR_EL1_DEFAULT + msr APRR_EL1, x14 + + .globl EXT(ppl_trampoline_end) +LEXT(ppl_trampoline_end) + + /* This should be the first instruction on a page. 
*/ + isb + + .globl EXT(ppl_no_exception_end) +LEXT(ppl_no_exception_end) + b ppl_exit +#endif /* __APRR_SUPPORTED__ */ + + + .text +ppl_exit: + /* + * If we are dealing with an exception, hand off to the first level + * exception handler. + */ + cmp x15, #PPL_EXIT_EXCEPTION + b.eq Ljump_to_fleh_handler + + /* Restore the original AIF state. */ + REENABLE_DAIF x10 + + /* If this was a panic call from the PPL, reinvoke panic. */ + cmp x15, #PPL_EXIT_PANIC_CALL + b.eq Ljump_to_panic_trap_to_debugger + + /* Load the preemption count. */ + mrs x10, TPIDR_EL1 + ldr w12, [x10, ACT_PREEMPT_CNT] + + /* Detect underflow */ + cbnz w12, Lno_preempt_underflow + b preempt_underflow +Lno_preempt_underflow: + + /* Lower the preemption count. */ + sub w12, w12, #1 + str w12, [x10, ACT_PREEMPT_CNT] + + /* Skip ASTs if the preemption count is not zero. */ + cbnz x12, Lppl_skip_ast_taken + + /* Skip the AST check if interrupts are disabled. */ + mrs x1, DAIF + tst x1, #DAIF_IRQF + b.ne Lppl_skip_ast_taken + + /* Disable interrupts. */ + msr DAIFSet, #(DAIFSC_IRQF | DAIFSC_FIQF) + + /* If there is no urgent AST, skip the AST. */ + ldr x12, [x10, ACT_CPUDATAP] + ldr x14, [x12, CPU_PENDING_AST] + tst x14, AST_URGENT + b.eq Lppl_defer_ast_taken + + /* Stash our return value and return reason. */ + mov x20, x0 + mov x21, x15 + + /* Handle the AST. */ + bl EXT(ast_taken_kernel) + + /* Restore the return value and the return reason. */ + mov x15, x21 + mov x0, x20 + +Lppl_defer_ast_taken: + /* Reenable interrupts. */ + msr DAIFClr, #(DAIFSC_IRQF | DAIFSC_FIQF) + +Lppl_skip_ast_taken: + /* Pop the stack frame. */ + ldp x29, x30, [sp, #0x10] + ldp x20, x21, [sp], #0x20 + + /* Check to see if this was a bad request. */ + cmp x15, #PPL_EXIT_BAD_CALL + b.eq Lppl_bad_call + + /* Return. */ + ARM64_STACK_EPILOG + + .align 2 +Ljump_to_fleh_handler: + br x25 + + .align 2 +Ljump_to_panic_trap_to_debugger: + b EXT(panic_trap_to_debugger) + +Lppl_bad_call: + /* Panic. 
*/ + adrp x0, Lppl_bad_call_panic_str@page + add x0, x0, Lppl_bad_call_panic_str@pageoff + b EXT(panic) + + .text + .align 2 + .globl EXT(ppl_dispatch) +LEXT(ppl_dispatch) + /* + * Save a couple of important registers (implementation detail; x12 has + * the PPL per-CPU data address; x13 is not actually interesting). + */ + stp x12, x13, [sp, #-0x10]! + + /* Restore the original AIF state. */ + REENABLE_DAIF x20 + + /* + * Note that if the method is NULL, we'll blow up with a prefetch abort, + * but the exception vectors will deal with this properly. + */ + + /* Invoke the PPL method. */ +#ifdef HAS_APPLE_PAC + blraaz x10 +#else + blr x10 +#endif + + /* Disable AIF. */ + msr DAIFSet, #(DAIFSC_ASYNCF | DAIFSC_IRQF | DAIFSC_FIQF) + + /* Restore those important registers. */ + ldp x12, x13, [sp], #0x10 + + /* Mark this as a regular return, and hand off to the return path. */ + b Lppl_dispatch_exit + + .text + .align 2 + .globl EXT(ppl_bootstrap_dispatch) +LEXT(ppl_bootstrap_dispatch) + /* Verify the PPL request. */ + cmp x15, PMAP_COUNT + b.hs Lppl_fail_bootstrap_dispatch + + /* Get the requested PPL routine. */ + adrp x9, EXT(ppl_handler_table)@page + add x9, x9, EXT(ppl_handler_table)@pageoff + ldr x10, [x9, x15, lsl #3] + + /* Invoke the requested PPL routine. */ +#ifdef HAS_APPLE_PAC + blraaz x10 +#else + blr x10 +#endif + /* Stash off the return value */ + mov x20, x0 + /* Drop the preemption count */ + bl EXT(_enable_preemption) + mov x0, x20 + + /* Pop the stack frame. */ + ldp x29, x30, [sp, #0x10] + ldp x20, x21, [sp], #0x20 +#if __has_feature(ptrauth_returns) + retab +#else + ret +#endif + +Lppl_fail_bootstrap_dispatch: + /* Pop our stack frame and panic. 
*/ + ldp x29, x30, [sp, #0x10] + ldp x20, x21, [sp], #0x20 +#if __has_feature(ptrauth_returns) + autibsp +#endif + adrp x0, Lppl_bad_call_panic_str@page + add x0, x0, Lppl_bad_call_panic_str@pageoff + b EXT(panic) + + .text + .align 2 + .globl EXT(ml_panic_trap_to_debugger) +LEXT(ml_panic_trap_to_debugger) +#if 0 + // TODO: why would we ever want to turn interrupts back on after going down panic path? + /* Grab the current AIF state, and disable AIF. */ + mrs x10, DAIF +#endif + msr DAIFSet, #(DAIFSC_ASYNCF | DAIFSC_IRQF | DAIFSC_FIQF) + + // we want interrupts to stay masked after exiting PPL when calling into panic to halt system + // x10 is used on the ppl_return_to_kernel_mode path to restore the desired DAIF state after GEXIT + mrs x10, DAIF + + /* Indicate (for the PPL->kernel transition) that we are panicking. */ + mov x15, #PPL_EXIT_PANIC_CALL + + /* Get the PPL per-CPU data. */ + GET_PMAP_CPU_DATA x11, x12, x13 + + /* Restore the old stack pointer as we can't push onto PPL stack after we exit PPL */ + ldr x12, [x11, PMAP_CPU_DATA_KERN_SAVED_SP] + mov sp, x12 + + /* + * Mark this CPU as panicking (PPL_STATE_PANIC). Halt and catch fire if our state + * machine appears to be broken (i.e. the CPU was not in PPL_STATE_DISPATCH). + */ + ldr w12, [x11, PMAP_CPU_DATA_PPL_STATE] + cmp w12, #PPL_STATE_DISPATCH + b.ne . + mov w13, #PPL_STATE_PANIC + str w13, [x11, PMAP_CPU_DATA_PPL_STATE] + + /* Now we are ready to exit the PPL. */ + b ppl_return_to_kernel_mode + + .data +Lppl_bad_call_panic_str: + .asciz "ppl_dispatch: failed due to bad arguments/state" +#else /* XNU_MONITOR */ .text .align 2 .globl EXT(ml_panic_trap_to_debugger) LEXT(ml_panic_trap_to_debugger) ret +#endif /* XNU_MONITOR */ /* ARM64_TODO Is globals_asm.h needed? 
*/ //#include "globals_asm.h" diff --git a/osfmk/arm64/machine_routines.c b/osfmk/arm64/machine_routines.c index 13aca14c1..037f34c13 100644 --- a/osfmk/arm64/machine_routines.c +++ b/osfmk/arm64/machine_routines.c @@ -56,7 +56,7 @@ #include -#if defined(KERNEL_INTEGRITY_KTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) #include #endif @@ -66,6 +66,9 @@ #include #endif +#if HAS_CLUSTER +static uint8_t cluster_initialized = 0; +#endif static int max_cpus_initialized = 0; @@ -90,6 +93,11 @@ extern vm_offset_t segLOWESTTEXT; extern vm_offset_t segLASTB; extern unsigned long segSizeLAST; +#if defined(HAS_IPI) +unsigned int gFastIPI = 1; +#define kDeferredIPITimerDefault (64 * NSEC_PER_USEC) /* in nanoseconds */ +static uint64_t deferred_ipi_timer_ns = kDeferredIPITimerDefault; +#endif /* defined(HAS_IPI) */ void machine_conf(void); @@ -113,40 +121,112 @@ void ml_lockdown_init(void); void ml_lockdown_run_handler(void); uint32_t get_arm_cpu_version(void); +#if defined(HAS_IPI) +static inline void +ml_cpu_signal_type(unsigned int cpu_mpidr, uint32_t type) +{ +#if HAS_CLUSTER + uint64_t local_mpidr; + /* NOTE: this logic expects that we are called in a non-preemptible + * context, or at least one in which the calling thread is bound + * to a single CPU. Otherwise we may migrate between choosing which + * IPI mechanism to use and issuing the IPI. 
*/ + MRS(local_mpidr, "MPIDR_EL1"); + if ((local_mpidr & MPIDR_AFF1_MASK) == (cpu_mpidr & MPIDR_AFF1_MASK)) { + uint64_t x = type | (cpu_mpidr & MPIDR_AFF0_MASK); + MSR(ARM64_REG_IPI_RR_LOCAL, x); + } else { + #define IPI_RR_TARGET_CLUSTER_SHIFT 16 + uint64_t x = type | ((cpu_mpidr & MPIDR_AFF1_MASK) << (IPI_RR_TARGET_CLUSTER_SHIFT - MPIDR_AFF1_SHIFT)) | (cpu_mpidr & MPIDR_AFF0_MASK); + MSR(ARM64_REG_IPI_RR_GLOBAL, x); + } +#else + uint64_t x = type | (cpu_mpidr & MPIDR_AFF0_MASK); + MSR(ARM64_REG_IPI_RR, x); +#endif +} +#endif +#if !defined(HAS_IPI) __dead2 +#endif void ml_cpu_signal(unsigned int cpu_mpidr __unused) { +#if defined(HAS_IPI) + ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_IMMEDIATE); +#else panic("Platform does not support ACC Fast IPI"); +#endif } +#if !defined(HAS_IPI) __dead2 +#endif void ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs) { +#if defined(HAS_IPI) + /* adjust IPI_CR timer countdown value for deferred IPI + * accepts input in nanosecs, convert to absolutetime (REFCLK ticks), + * clamp maximum REFCLK ticks to 0xFFFF (16 bit field) + * + * global register, should only require a single write to update all + * CPU cores: from Skye ACC user spec section 5.7.3.3 + * + * IPICR is a global register but there are two copies in ACC: one at pBLK and one at eBLK. + * IPICR write SPR token also traverses both pCPM and eCPM rings and updates both copies. 
+ */ + uint64_t abstime; + + nanoseconds_to_absolutetime(nanosecs, &abstime); + + abstime = MIN(abstime, 0xFFFF); + + /* update deferred_ipi_timer_ns with the new clamped value */ + absolutetime_to_nanoseconds(abstime, &deferred_ipi_timer_ns); + + MSR(ARM64_REG_IPI_CR, abstime); +#else (void)nanosecs; panic("Platform does not support ACC Fast IPI"); +#endif } uint64_t ml_cpu_signal_deferred_get_timer() { +#if defined(HAS_IPI) + return deferred_ipi_timer_ns; +#else return 0; +#endif } +#if !defined(HAS_IPI) __dead2 +#endif void ml_cpu_signal_deferred(unsigned int cpu_mpidr __unused) { +#if defined(HAS_IPI) + ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_DEFERRED); +#else panic("Platform does not support ACC Fast IPI deferral"); +#endif } +#if !defined(HAS_IPI) __dead2 +#endif void ml_cpu_signal_retract(unsigned int cpu_mpidr __unused) { +#if defined(HAS_IPI) + ml_cpu_signal_type(cpu_mpidr, ARM64_REG_IPI_RR_TYPE_RETRACT); +#else panic("Platform does not support ACC Fast IPI retraction"); +#endif } void @@ -241,7 +321,11 @@ get_arm_cpu_version(void) boolean_t user_cont_hwclock_allowed(void) { +#if HAS_CONTINUOUS_HWCLOCK + return TRUE; +#else return FALSE; +#endif } @@ -257,7 +341,7 @@ arm64_wfe_allowed(void) return TRUE; } -#if defined(KERNEL_INTEGRITY_KTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) uint64_t rorgn_begin __attribute__((section("__DATA, __const"))) = 0; uint64_t rorgn_end __attribute__((section("__DATA, __const"))) = 0; @@ -307,6 +391,11 @@ rorgn_stash_range(void) rc = DTGetProperty(entryP, "reg", (void **)®_prop, &prop_size); assert(rc == kSuccess); amcc_base = ml_io_map(soc_base + *reg_prop, *(reg_prop + 1)); +#elif defined(KERNEL_INTEGRITY_CTRR) + /* TODO: t8020 mcc entry not in device tree yet; we'll do it LIVE */ +#define TEMP_AMCC_BASE_PA 0x200000000ULL +#define TEMP_AMCC_SZ 0x100000 + amcc_base = ml_io_map(TEMP_AMCC_BASE_PA, TEMP_AMCC_SZ); #else #error "KERNEL_INTEGRITY config error" #endif @@ -315,6 +404,27 @@ 
rorgn_stash_range(void) assert(rRORGNENDADDR > rRORGNBASEADDR); rorgn_begin = (rRORGNBASEADDR << AMCC_PGSHIFT) + dram_base; rorgn_end = (rRORGNENDADDR << AMCC_PGSHIFT) + dram_base; +#elif defined(KERNEL_INTEGRITY_CTRR) + rorgn_begin = rCTRR_AMCC_PLANE_REG(0, CTRR_A_BASEADDR); + rorgn_end = rCTRR_AMCC_PLANE_REG(0, CTRR_A_ENDADDR); + assert(rorgn_end > rorgn_begin); + + for (int i = 0; i < CTRR_AMCC_MAX_PLANES; ++i) { + uint32_t begin = rCTRR_AMCC_PLANE_REG(i, CTRR_A_BASEADDR); + uint32_t end = rCTRR_AMCC_PLANE_REG(i, CTRR_A_ENDADDR); + if (!(begin == rorgn_begin && end == rorgn_end)) { +#if DEVELOPMENT || DEBUG + panic("iboot programmed CTRR bounds are inconsistent"); +#else + panic("Inconsistent memory configuration"); +#endif + } + } + + // convert from page number from DRAM base to PA + rorgn_begin = (rorgn_begin << AMCC_PGSHIFT) + dram_base; + rorgn_end = (rorgn_end << AMCC_PGSHIFT) + dram_base; + #else #error KERNEL_INTEGRITY config error #endif /* defined (KERNEL_INTEGRITY_KTRR) */ @@ -330,6 +440,11 @@ assert_unlocked() #if defined(KERNEL_INTEGRITY_KTRR) rorgn_lock = rRORGNLOCK; ktrr_lock = __builtin_arm_rsr64(ARM64_REG_KTRR_LOCK_EL1); +#elif defined(KERNEL_INTEGRITY_CTRR) + for (int i = 0; i < CTRR_AMCC_MAX_PLANES; ++i) { + rorgn_lock |= rCTRR_AMCC_PLANE_REG(i, CTRR_A_LOCK); + } + ktrr_lock = __builtin_arm_rsr64(ARM64_REG_CTRR_LOCK_EL1); #else #error KERNEL_INTEGRITY config error #endif /* defined(KERNEL_INTEGRITY_KTRR) */ @@ -344,6 +459,13 @@ lock_amcc() #if defined(KERNEL_INTEGRITY_KTRR) rRORGNLOCK = 1; __builtin_arm_isb(ISB_SY); +#elif defined(KERNEL_INTEGRITY_CTRR) + /* lockdown planes in reverse order as plane 0 should be locked last */ + for (int i = 0; i < CTRR_AMCC_MAX_PLANES; ++i) { + rCTRR_AMCC_PLANE_REG(CTRR_AMCC_MAX_PLANES - i - 1, CTRR_A_ENABLE) = 1; + rCTRR_AMCC_PLANE_REG(CTRR_AMCC_MAX_PLANES - i - 1, CTRR_A_LOCK) = 1; + __builtin_arm_isb(ISB_SY); + } #else #error KERNEL_INTEGRITY config error #endif @@ -363,6 +485,37 @@ lock_mmu(uint64_t begin, 
uint64_t end) __builtin_arm_isb(ISB_SY); flush_mmu_tlb(); +#elif defined (KERNEL_INTEGRITY_CTRR) + /* this will lock the entire bootstrap cluster. non bootstrap clusters + * will be locked by respective cluster master in start.s */ + + __builtin_arm_wsr64(ARM64_REG_CTRR_A_LWR_EL1, begin); + __builtin_arm_wsr64(ARM64_REG_CTRR_A_UPR_EL1, end); + +#if !defined(APPLEVORTEX) + /* H12 changed sequence, must invalidate TLB immediately after setting CTRR bounds */ + __builtin_arm_isb(ISB_SY); /* ensure all prior MSRs are complete */ + flush_mmu_tlb(); +#endif /* !defined(APPLEVORTEX) */ + + __builtin_arm_wsr64(ARM64_REG_CTRR_CTL_EL1, CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT); + __builtin_arm_wsr64(ARM64_REG_CTRR_LOCK_EL1, 1ULL); + + uint64_t current_el = __builtin_arm_rsr64("CurrentEL"); + if (current_el == PSR64_MODE_EL2) { + // CTRR v2 has explicit registers for cluster config. they can only be written in EL2 + + __builtin_arm_wsr64(ACC_CTRR_A_LWR_EL2, begin); + __builtin_arm_wsr64(ACC_CTRR_A_UPR_EL2, end); + __builtin_arm_wsr64(ACC_CTRR_CTL_EL2, CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT); + __builtin_arm_wsr64(ACC_CTRR_LOCK_EL2, 1ULL); + } + + __builtin_arm_isb(ISB_SY); /* ensure all prior MSRs are complete */ +#if defined(APPLEVORTEX) + flush_mmu_tlb(); +#endif /* defined(APPLEVORTEX) */ + #else /* defined(KERNEL_INTEGRITY_KTRR) */ #error KERNEL_INTEGRITY config error #endif /* defined(KERNEL_INTEGRITY_KTRR) */ @@ -373,6 +526,17 @@ assert_amcc_cache_disabled() { #if defined(KERNEL_INTEGRITY_KTRR) assert((rMCCGEN & 1) == 0); /* assert M$ disabled or LLC clean will be unreliable */ +#elif defined(KERNEL_INTEGRITY_CTRR) && (defined(ARM64_BOARD_CONFIG_T8006)) + /* + * T8006 differentiates between data and tag ways being powered up, so + * make sure to check that both are zero on its single memory plane. 
+ */ + assert((rCTRR_AMCC_PLANE_REG(0, CTRR_AMCC_PWRONWAYCNTSTATUS) & + (AMCC_CURTAGWAYCNT_MASK | AMCC_CURDATWAYCNT_MASK)) == 0); +#elif defined (KERNEL_INTEGRITY_CTRR) + for (int i = 0; i < CTRR_AMCC_MAX_PLANES; ++i) { + assert(rCTRR_AMCC_PLANE_REG(i, CTRR_AMCC_WAYONCNT) == 0); + } #else #error KERNEL_INTEGRITY config error #endif @@ -423,6 +587,11 @@ rorgn_lockdown(void) assert(rorgn_begin == ktrr_begin && rorgn_end == (ktrr_end + last_segsz)); /* assert that __LAST segment containing privileged insns is only a single page */ assert(last_segsz == PAGE_SIZE); +#elif defined(KERNEL_INTEGRITY_CTRR) + ktrr_end = (ktrr_end + last_segsz - 1) & ~AMCC_PGMASK; + /* __LAST is part of MMU CTRR region. Can't use the KTRR style method of making + * __pinst no execute because PXN applies with MMU off in CTRR. */ + assert(rorgn_begin == ktrr_begin && rorgn_end == ktrr_end); #endif @@ -445,17 +614,38 @@ rorgn_lockdown(void) out: #endif +#if defined(KERNEL_INTEGRITY_CTRR) + { + /* wake any threads blocked on cluster master lockdown */ + cpu_data_t *cdp; + uint64_t mpidr_el1_value; + + cdp = getCpuDatap(); + MRS(mpidr_el1_value, "MPIDR_EL1"); + cdp->cpu_cluster_id = (mpidr_el1_value & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT; + assert(cdp->cpu_cluster_id < __ARM_CLUSTER_COUNT__); + ctrr_cluster_locked[cdp->cpu_cluster_id] = 1; + thread_wakeup(&ctrr_cluster_locked[cdp->cpu_cluster_id]); + } +#endif /* now we can run lockdown handler */ ml_lockdown_run_handler(); } -#endif /* defined(KERNEL_INTEGRITY_KTRR)*/ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ void machine_startup(__unused boot_args * args) { int boot_arg; +#if defined(HAS_IPI) && (DEVELOPMENT || DEBUG) + if (!PE_parse_boot_argn("fastipi", &gFastIPI, sizeof(gFastIPI))) { + gFastIPI = 1; + } + + PE_parse_boot_argn("fastipitimeout", &deferred_ipi_timer_ns, sizeof(deferred_ipi_timer_ns)); +#endif /* defined(HAS_IPI) && (DEVELOPMENT || DEBUG)*/ #if CONFIG_NONFATAL_ASSERTS 
PE_parse_boot_argn("assert", &mach_assert, sizeof(mach_assert)); @@ -484,7 +674,7 @@ machine_lockdown_preflight(void) { #if CONFIG_KERNEL_INTEGRITY -#if defined(KERNEL_INTEGRITY_KTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) rorgn_stash_range(); #endif @@ -508,8 +698,11 @@ machine_lockdown(void) #endif #endif /* KERNEL_INTEGRITY_WT */ +#if XNU_MONITOR + pmap_lockdown_ppl(); +#endif -#if defined(KERNEL_INTEGRITY_KTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) /* KTRR * * Lock physical KTRR region. KTRR region is read-only. Memory outside @@ -517,7 +710,7 @@ machine_lockdown(void) */ rorgn_lockdown(); -#endif /* defined(KERNEL_INTEGRITY_KTRR)*/ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ #endif /* CONFIG_KERNEL_INTEGRITY */ @@ -819,6 +1012,16 @@ ml_install_interrupt_handler( void ml_init_interrupt(void) { +#if defined(HAS_IPI) + /* + * ml_init_interrupt will get called once for each CPU, but this is redundant + * because there is only one global copy of the register for skye. 
do it only + * on the bootstrap cpu + */ + if (getCpuDatap()->cluster_master) { + ml_cpu_signal_deferred_adjust_timer(deferred_ipi_timer_ns); + } +#endif } /* @@ -960,6 +1163,9 @@ ml_lockdown_init() lck_mtx_init(&lockdown_handler_lck, lockdown_handler_grp, NULL); +#if defined(KERNEL_INTEGRITY_CTRR) + init_ctrr_cpu_start_lock(); +#endif } kern_return_t @@ -973,7 +1179,7 @@ ml_lockdown_handler_register(lockdown_handler_t f, void *this) lockdown_handler = f; lockdown_this = this; -#if !(defined(KERNEL_INTEGRITY_KTRR)) +#if !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) lockdown_done = 1; lockdown_handler(this); #else @@ -1063,7 +1269,11 @@ ml_processor_register(ml_processor_info_t *in_processor_info, this_cpu_datap->cpu_l3_id = in_processor_info->l3_cache_id; this_cpu_datap->cpu_l3_size = in_processor_info->l3_cache_size; +#if HAS_CLUSTER + this_cpu_datap->cluster_master = !OSTestAndSet(this_cpu_datap->cpu_cluster_id, &cluster_initialized); +#else /* HAS_CLUSTER */ this_cpu_datap->cluster_master = is_boot_cpu; +#endif /* HAS_CLUSTER */ pset = pset_find(in_processor_info->cluster_id, processor_pset(master_processor)); assert(pset != NULL); @@ -1288,6 +1498,10 @@ ml_static_protect( pt_entry_t *pte_p; pt_entry_t ptmp; +#if XNU_MONITOR + assert(!TEST_PAGE_RATIO_4); + assert(!pmap_is_monitor(ppn)); +#endif tte2 = arm_kva_to_tte(vaddr_cur); @@ -1668,6 +1882,13 @@ boolean_t ml_wants_panic_trap_to_debugger(void) { boolean_t result = FALSE; +#if XNU_MONITOR + /* + * This looks racey, but if we are in the PPL, preemption will be + * disabled. 
+ */ + result = ((pmap_get_cpu_data()->ppl_state == PPL_STATE_DISPATCH) && pmap_ppl_locked_down); +#endif return result; } diff --git a/osfmk/arm64/machine_routines_asm.s b/osfmk/arm64/machine_routines_asm.s index 5dc6cde73..9d41431fe 100644 --- a/osfmk/arm64/machine_routines_asm.s +++ b/osfmk/arm64/machine_routines_asm.s @@ -58,7 +58,62 @@ LEXT(ml_set_kernelkey_enabled) #endif /* defined(HAS_APPLE_PAC) */ +#if HAS_BP_RET +/* + * void set_bp_ret(void) + * Helper function to enable branch predictor state retention + * across ACC sleep + */ + + .align 2 + .globl EXT(set_bp_ret) +LEXT(set_bp_ret) + // Load bpret boot-arg + adrp x14, EXT(bp_ret)@page + add x14, x14, EXT(bp_ret)@pageoff + ldr w14, [x14] + + mrs x13, ARM64_REG_ACC_CFG + and x13, x13, (~(ARM64_REG_ACC_CFG_bpSlp_mask << ARM64_REG_ACC_CFG_bpSlp_shift)) + and x14, x14, #(ARM64_REG_ACC_CFG_bpSlp_mask) + orr x13, x13, x14, lsl #(ARM64_REG_ACC_CFG_bpSlp_shift) + msr ARM64_REG_ACC_CFG, x13 + + ret +#endif // HAS_BP_RET + +#if HAS_NEX_PG + .align 2 + .globl EXT(set_nex_pg) +LEXT(set_nex_pg) + mrs x14, MPIDR_EL1 + // Skip if this isn't a p-core; NEX powergating isn't available for e-cores + and x14, x14, #(MPIDR_PNE) + cbz x14, Lnex_pg_done + + // Set the SEG-recommended value of 12 additional reset cycles + mrs x14, ARM64_REG_HID13 + and x14, x14, (~ARM64_REG_HID13_RstCyc_mask) + orr x14, x14, ARM64_REG_HID13_RstCyc_val + msr ARM64_REG_HID13, x14 + + // Load nexpg boot-arg + adrp x14, EXT(nex_pg)@page + add x14, x14, EXT(nex_pg)@pageoff + ldr w14, [x14] + + mrs x13, ARM64_REG_HID14 + and x13, x13, (~ARM64_REG_HID14_NexPwgEn) + cbz w14, Lset_nex_pg + orr x13, x13, ARM64_REG_HID14_NexPwgEn +Lset_nex_pg: + msr ARM64_REG_HID14, x13 + +Lnex_pg_done: + ret + +#endif // HAS_NEX_PG /* uint32_t get_fpscr(void): * Returns (FPSR | FPCR). 
@@ -168,12 +223,21 @@ LEXT(set_mmu_ttb_alternate) bl EXT(pinst_set_ttbr1) mov lr, x1 #else +#if defined(HAS_VMSA_LOCK) + mrs x1, ARM64_REG_VMSA_LOCK_EL1 + and x1, x1, #(VMSA_LOCK_TTBR1_EL1) + cbnz x1, L_set_locked_reg_panic +#endif /* defined(HAS_VMSA_LOCK) */ msr TTBR1_EL1, x0 #endif /* defined(KERNEL_INTEGRITY_KTRR) */ isb sy ret +#if XNU_MONITOR + .section __PPLTEXT,__text,regular,pure_instructions +#else .text +#endif .align 2 .globl EXT(set_mmu_ttb) LEXT(set_mmu_ttb) @@ -211,6 +275,19 @@ LEXT(set_vbar_el1) #endif #endif /* __ARM_KERNEL_PROTECT__ */ +#if defined(HAS_VMSA_LOCK) + .text + .align 2 + .globl EXT(vmsa_lock) +LEXT(vmsa_lock) + isb sy + mov x1, #(VMSA_LOCK_SCTLR_M_BIT) + mov x0, #(VMSA_LOCK_TTBR1_EL1 | VMSA_LOCK_TCR_EL1 | VMSA_LOCK_VBAR_EL1) + orr x0, x0, x1 + msr ARM64_REG_VMSA_LOCK_EL1, x0 + isb sy + ret +#endif /* defined(HAS_VMSA_LOCK) */ /* * set translation control register @@ -229,6 +306,12 @@ LEXT(set_tcr) bl EXT(pinst_set_tcr) mov lr, x1 #else +#if defined(HAS_VMSA_LOCK) + // assert TCR unlocked + mrs x1, ARM64_REG_VMSA_LOCK_EL1 + and x1, x1, #(VMSA_LOCK_TCR_EL1) + cbnz x1, L_set_locked_reg_panic +#endif /* defined(HAS_VMSA_LOCK) */ msr TCR_EL1, x0 #endif /* defined(KERNEL_INTRITY_KTRR) */ isb sy @@ -256,7 +339,7 @@ L_set_tcr_panic_str: L_set_locked_reg_panic_str: .asciz "attempt to set locked register: (%llx)\n" #else -#if defined(KERNEL_INTEGRITY_KTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) mov x1, lr bl EXT(pinst_set_tcr) mov lr, x1 @@ -690,6 +773,9 @@ LEXT(arm64_prepare_for_sleep) orr x1, x1, #( ARM64_REG_ACC_OVRD_ok2TrDnLnk_deepsleep) and x1, x1, #(~(ARM64_REG_ACC_OVRD_ok2PwrDnCPM_mask)) orr x1, x1, #( ARM64_REG_ACC_OVRD_ok2PwrDnCPM_deepsleep) +#if HAS_RETENTION_STATE + orr x1, x1, #(ARM64_REG_ACC_OVRD_disPioOnWfiCpu) +#endif msr ARM64_REG_ACC_OVRD, x1 @@ -701,9 +787,12 @@ LEXT(arm64_prepare_for_sleep) // Set "OK to power down" () mrs x0, ARM64_REG_CYC_OVRD orr x0, x0, 
#(ARM64_REG_CYC_OVRD_ok2pwrdn_force_down) +#if HAS_RETENTION_STATE + orr x0, x0, #(ARM64_REG_CYC_OVRD_disWfiRetn) +#endif msr ARM64_REG_CYC_OVRD, x0 -#if defined(APPLEMONSOON) +#if defined(APPLEMONSOON) || defined(APPLEVORTEX) ARM64_IS_PCORE x0 cbz x0, Lwfi_inst // skip if not p-core @@ -717,6 +806,12 @@ LEXT(arm64_prepare_for_sleep) * and re-enabling GUPS, which forces the prefetch queue to * drain. This should be done as close to wfi as possible, i.e. * at the very end of arm64_prepare_for_sleep(). */ +#if defined(APPLEVORTEX) + /* : Cyprus A0/A1 parts have a similar + * bug in the HSP prefetcher that can be worked around through + * the same method mentioned above for Skye. */ + SKIP_IF_CPU_VERSION_GREATER_OR_EQUAL x0, VORTEX_CPU_VERSION_B0, Lwfi_inst +#endif mrs x0, ARM64_REG_HID10 orr x0, x0, #(ARM64_REG_HID10_DisHwpGups) msr ARM64_REG_HID10, x0 @@ -750,6 +845,21 @@ LEXT(arm64_force_wfi_clock_gate) ARM64_STACK_EPILOG +#if HAS_RETENTION_STATE + .text + .align 2 + .globl EXT(arm64_retention_wfi) +LEXT(arm64_retention_wfi) + wfi + cbz lr, Lwfi_retention // If lr is 0, we entered retention state and lost all GPRs except sp and pc + ret // Otherwise just return to cpu_idle() +Lwfi_retention: + mov x0, #1 + bl EXT(ClearIdlePop) + mov x0, #0 + bl EXT(cpu_idle_exit) // cpu_idle_exit(from_reset = FALSE) + b . 
// cpu_idle_exit() should never return +#endif #if defined(APPLETYPHOON) @@ -931,7 +1041,7 @@ LEXT(arm64_replace_bootstack) mrs x4, DAIF // Load current DAIF; use x4 as pinst may trash x1-x3 msr DAIFSet, #(DAIFSC_IRQF | DAIFSC_FIQF | DAIFSC_ASYNCF) // Disable IRQ/FIQ/serror // Set SP_EL1 to exception stack -#if defined(KERNEL_INTEGRITY_KTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) mov x1, lr bl EXT(pinst_spsel_1) mov lr, x1 diff --git a/osfmk/arm64/monotonic.h b/osfmk/arm64/monotonic.h index 992d501db..cd62e333a 100644 --- a/osfmk/arm64/monotonic.h +++ b/osfmk/arm64/monotonic.h @@ -44,7 +44,11 @@ __BEGIN_DECLS #include +#if HAS_UNCORE_CTRS +#define MT_NDEVS 2 +#else /* HAS_UNCORE_CTRS */ #define MT_NDEVS 1 +#endif /* !HAS_UNCORE_CTRS */ #define MT_CORE_CYCLES 0 #define MT_CORE_INSTRS 1 @@ -67,6 +71,12 @@ __BEGIN_DECLS #define PMCR0_PMAI (UINT64_C(1) << 11) #define PMCR0_PMI(REG) ((REG) & PMCR0_PMAI) +#if HAS_UNCORE_CTRS + +#define UPMSR "s3_7_c15_c6_4" +#define UPMSR_PMI(REG) ((REG) & 0x1) + +#endif /* HAS_UNCORE_CTRS */ static inline bool mt_pmi_pending(uint64_t * restrict pmcr0_out, @@ -82,7 +92,18 @@ mt_pmi_pending(uint64_t * restrict pmcr0_out, } *pmcr0_out = pmcr0; +#if HAS_UNCORE_CTRS + extern bool mt_uncore_enabled; + if (mt_uncore_enabled) { + uint64_t upmsr = __builtin_arm_rsr64(UPMSR); + if (UPMSR_PMI(upmsr)) { + pmi = true; + } + *upmsr_out = upmsr; + } +#else /* HAS_UNCORE_CTRS */ #pragma unused(upmsr_out) +#endif /* !HAS_UNCORE_CTRS */ return pmi; } diff --git a/osfmk/arm64/monotonic_arm64.c b/osfmk/arm64/monotonic_arm64.c index 25895247f..51361f693 100644 --- a/osfmk/arm64/monotonic_arm64.c +++ b/osfmk/arm64/monotonic_arm64.c @@ -281,12 +281,942 @@ core_idle(__unused cpu_data_t *cpu) #pragma mark uncore performance monitor +#if HAS_UNCORE_CTRS + +static bool mt_uncore_initted = false; + +/* + * Uncore Performance Monitor + * + * Uncore performance monitors provide event-counting for the last-level caches + * (LLCs). 
Each LLC has its own uncore performance monitor, which can only be + * accessed by cores that use that LLC. Like the core performance monitoring + * unit, uncore counters are configured globally. If there is more than one + * LLC on the system, PIO reads must be used to satisfy uncore requests (using + * the `_r` remote variants of the access functions). Otherwise, local MSRs + * suffice (using the `_l` local variants of the access functions). + */ + +#if UNCORE_PER_CLUSTER +static vm_size_t cpm_impl_size = 0; +static uintptr_t cpm_impl[__ARM_CLUSTER_COUNT__] = {}; +static uintptr_t cpm_impl_phys[__ARM_CLUSTER_COUNT__] = {}; +#endif /* UNCORE_PER_CLUSTER */ + +#if UNCORE_VERSION >= 2 +/* + * V2 uncore monitors feature a CTI mechanism -- the second bit of UPMSR is + * used to track if a CTI has been triggered due to an overflow. + */ +#define UPMSR_OVF_POS 2 +#else /* UNCORE_VERSION >= 2 */ +#define UPMSR_OVF_POS 1 +#endif /* UNCORE_VERSION < 2 */ +#define UPMSR_OVF(R, CTR) ((R) >> ((CTR) + UPMSR_OVF_POS) & 0x1) +#define UPMSR_OVF_MASK (((UINT64_C(1) << UNCORE_NCTRS) - 1) << UPMSR_OVF_POS) + +#define UPMPCM "s3_7_c15_c5_4" +#define UPMPCM_CORE(ID) (UINT64_C(1) << (ID)) + +/* + * The uncore_pmi_mask is a bitmask of CPUs that receive uncore PMIs. It's + * initialized by uncore_init and controllable by the uncore_pmi_mask boot-arg. + */ +static int32_t uncore_pmi_mask = 0; + +/* + * The uncore_active_ctrs is a bitmask of uncore counters that are currently + * requested. + */ +static uint16_t uncore_active_ctrs = 0; +static_assert(sizeof(uncore_active_ctrs) * CHAR_BIT >= UNCORE_NCTRS, + "counter mask should fit the full range of counters"); + +/* + * mt_uncore_enabled is true when any uncore counters are active. + */ +bool mt_uncore_enabled = false; + +/* + * Each uncore unit has its own monitor, corresponding to the memory hierarchy + * of the LLCs. 
+ */ +#if UNCORE_PER_CLUSTER +#define UNCORE_NMONITORS (__ARM_CLUSTER_COUNT__) +#else /* UNCORE_PER_CLUSTER */ +#define UNCORE_NMONITORS (1) +#endif /* !UNCORE_PER_CLUSTER */ + +/* + * The uncore_events are the event configurations for each uncore counter -- as + * a union to make it easy to program the hardware registers. + */ +static struct uncore_config { + union { + uint8_t uce_ctrs[UNCORE_NCTRS]; + uint64_t uce_regs[UNCORE_NCTRS / 8]; + } uc_events; + union { + uint16_t uccm_masks[UNCORE_NCTRS]; + uint64_t uccm_regs[UNCORE_NCTRS / 4]; + } uc_cpu_masks[UNCORE_NMONITORS]; +} uncore_config; + +static struct uncore_monitor { + /* + * The last snapshot of each of the hardware counter values. + */ + uint64_t um_snaps[UNCORE_NCTRS]; + + /* + * The accumulated counts for each counter. + */ + uint64_t um_counts[UNCORE_NCTRS]; + + /* + * Protects accessing the hardware registers and fields in this structure. + */ + lck_spin_t um_lock; + + /* + * Whether this monitor needs its registers restored after wake. + */ + bool um_sleeping; +} uncore_monitors[UNCORE_NMONITORS]; + +static unsigned int +uncmon_get_curid(void) +{ +#if UNCORE_PER_CLUSTER + return cpu_cluster_id(); +#else /* UNCORE_PER_CLUSTER */ + return 0; +#endif /* !UNCORE_PER_CLUSTER */ +} + +/* + * Per-monitor locks are required to prevent races with the PMI handlers, not + * from other CPUs that are configuring (those are serialized with monotonic's + * per-device lock). + */ + +static int +uncmon_lock(struct uncore_monitor *mon) +{ + int intrs_en = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(&mon->um_lock); + return intrs_en; +} + +static void +uncmon_unlock(struct uncore_monitor *mon, int intrs_en) +{ + lck_spin_unlock(&mon->um_lock); + (void)ml_set_interrupts_enabled(intrs_en); +} + +/* + * Helper functions for accessing the hardware -- these require the monitor be + * locked to prevent other CPUs' PMI handlers from making local modifications + * or updating the counts. 
+ */ + +#if UNCORE_VERSION >= 2 +#define UPMCR0_INTEN_POS 20 +#define UPMCR0_INTGEN_POS 16 +#else /* UNCORE_VERSION >= 2 */ +#define UPMCR0_INTEN_POS 12 +#define UPMCR0_INTGEN_POS 8 +#endif /* UNCORE_VERSION < 2 */ +enum { + UPMCR0_INTGEN_OFF = 0, + /* fast PMIs are only supported on core CPMU */ + UPMCR0_INTGEN_AIC = 2, + UPMCR0_INTGEN_HALT = 3, + UPMCR0_INTGEN_FIQ = 4, +}; +/* always enable interrupts for all counters */ +#define UPMCR0_INTEN (((1ULL << UNCORE_NCTRS) - 1) << UPMCR0_INTEN_POS) +/* route uncore PMIs through the FIQ path */ +#define UPMCR0_INIT (UPMCR0_INTEN | (UPMCR0_INTGEN_FIQ << UPMCR0_INTGEN_POS)) + +/* + * Turn counting on for counters set in the `enctrmask` and off, otherwise. + */ +static inline void +uncmon_set_counting_locked_l(__unused unsigned int monid, uint64_t enctrmask) +{ + /* + * UPMCR0 controls which counters are enabled and how interrupts are generated + * for overflows. + */ +#define UPMCR0 "s3_7_c15_c0_4" + __builtin_arm_wsr64(UPMCR0, UPMCR0_INIT | enctrmask); +} + +#if UNCORE_PER_CLUSTER + +/* + * Turn counting on for counters set in the `enctrmask` and off, otherwise. + */ +static inline void +uncmon_set_counting_locked_r(unsigned int monid, uint64_t enctrmask) +{ + const uintptr_t upmcr0_offset = 0x4180; + *(uint64_t *)(cpm_impl[monid] + upmcr0_offset) = UPMCR0_INIT | enctrmask; +} + +#endif /* UNCORE_PER_CLUSTER */ + +/* + * The uncore performance monitoring counters (UPMCs) are 48-bits wide. The + * high bit is an overflow bit, triggering a PMI, providing 47 usable bits. + */ + +#define UPMC_MAX ((UINT64_C(1) << 48) - 1) + +/* + * The `__builtin_arm_{r,w}sr` functions require constant strings, since the + * MSR/MRS instructions encode the registers as immediates. Otherwise, this + * would be indexing into an array of strings. 
+ */ + +#define UPMC0 "s3_7_c15_c7_4" +#define UPMC1 "s3_7_c15_c8_4" +#define UPMC2 "s3_7_c15_c9_4" +#define UPMC3 "s3_7_c15_c10_4" +#define UPMC4 "s3_7_c15_c11_4" +#define UPMC5 "s3_7_c15_c12_4" +#define UPMC6 "s3_7_c15_c13_4" +#define UPMC7 "s3_7_c15_c14_4" +#if UNCORE_NCTRS > 8 +#define UPMC8 "s3_7_c15_c0_5" +#define UPMC9 "s3_7_c15_c1_5" +#define UPMC10 "s3_7_c15_c2_5" +#define UPMC11 "s3_7_c15_c3_5" +#define UPMC12 "s3_7_c15_c4_5" +#define UPMC13 "s3_7_c15_c5_5" +#define UPMC14 "s3_7_c15_c6_5" +#define UPMC15 "s3_7_c15_c7_5" +#endif /* UNCORE_NCTRS > 8 */ + +#define UPMC_0_7(X, A) X(0, A); X(1, A); X(2, A); X(3, A); X(4, A); X(5, A); \ + X(6, A); X(7, A) +#if UNCORE_NCTRS <= 8 +#define UPMC_ALL(X, A) UPMC_0_7(X, A) +#else /* UNCORE_NCTRS <= 8 */ +#define UPMC_8_15(X, A) X(8, A); X(9, A); X(10, A); X(11, A); X(12, A); \ + X(13, A); X(14, A); X(15, A) +#define UPMC_ALL(X, A) UPMC_0_7(X, A); UPMC_8_15(X, A) +#endif /* UNCORE_NCTRS > 8 */ + +static inline uint64_t +uncmon_read_counter_locked_l(__unused unsigned int monid, unsigned int ctr) +{ + assert(ctr < UNCORE_NCTRS); + switch (ctr) { +#define UPMC_RD(CTR, UNUSED) case (CTR): return __builtin_arm_rsr64(UPMC ## CTR) + UPMC_ALL(UPMC_RD, 0); +#undef UPMC_RD + default: + panic("monotonic: invalid counter read %u", ctr); + __builtin_unreachable(); + } +} + +static inline void +uncmon_write_counter_locked_l(__unused unsigned int monid, unsigned int ctr, + uint64_t count) +{ + assert(count < UPMC_MAX); + assert(ctr < UNCORE_NCTRS); + switch (ctr) { +#define UPMC_WR(CTR, COUNT) case (CTR): \ + return __builtin_arm_wsr64(UPMC ## CTR, (COUNT)) + UPMC_ALL(UPMC_WR, count); +#undef UPMC_WR + default: + panic("monotonic: invalid counter write %u", ctr); + } +} + +#if UNCORE_PER_CLUSTER + +static const uint8_t clust_offs[__ARM_CLUSTER_COUNT__] = CPU_CLUSTER_OFFSETS; + +uintptr_t upmc_offs[UNCORE_NCTRS] = { + [0] = 0x4100, [1] = 0x4248, [2] = 0x4110, [3] = 0x4250, [4] = 0x4120, + [5] = 0x4258, [6] = 0x4130, [7] = 0x4260, [8] 
= 0x4140, [9] = 0x4268, + [10] = 0x4150, [11] = 0x4270, [12] = 0x4160, [13] = 0x4278, + [14] = 0x4170, [15] = 0x4280, +}; + +static inline uint64_t +uncmon_read_counter_locked_r(unsigned int mon_id, unsigned int ctr) +{ + assert(mon_id < __ARM_CLUSTER_COUNT__); + assert(ctr < UNCORE_NCTRS); + return *(uint64_t *)(cpm_impl[mon_id] + upmc_offs[ctr]); +} + +static inline void +uncmon_write_counter_locked_r(unsigned int mon_id, unsigned int ctr, + uint64_t count) +{ + assert(count < UPMC_MAX); + assert(ctr < UNCORE_NCTRS); + assert(mon_id < __ARM_CLUSTER_COUNT__); + *(uint64_t *)(cpm_impl[mon_id] + upmc_offs[ctr]) = count; +} + +#endif /* UNCORE_PER_CLUSTER */ + +static inline void +uncmon_update_locked(unsigned int monid, unsigned int curid, unsigned int ctr) +{ + struct uncore_monitor *mon = &uncore_monitors[monid]; + uint64_t snap = 0; + if (curid == monid) { + snap = uncmon_read_counter_locked_l(monid, ctr); + } else { +#if UNCORE_PER_CLUSTER + snap = uncmon_read_counter_locked_r(monid, ctr); +#endif /* UNCORE_PER_CLUSTER */ + } + /* counters should increase monotonically */ + assert(snap >= mon->um_snaps[ctr]); + mon->um_counts[ctr] += snap - mon->um_snaps[ctr]; + mon->um_snaps[ctr] = snap; +} + +static inline void +uncmon_program_events_locked_l(unsigned int monid) +{ + /* + * UPMESR[01] is the event selection register that determines which event a + * counter will count. + */ +#define UPMESR0 "s3_7_c15_c1_4" + CTRL_REG_SET(UPMESR0, uncore_config.uc_events.uce_regs[0]); + +#if UNCORE_NCTRS > 8 +#define UPMESR1 "s3_7_c15_c11_5" + CTRL_REG_SET(UPMESR1, uncore_config.uc_events.uce_regs[1]); +#endif /* UNCORE_NCTRS > 8 */ + + /* + * UPMECM[0123] are the event core masks for each counter -- whether or not + * that counter counts events generated by an agent. These are set to all + * ones so the uncore counters count events from all cores. + * + * The bits are based off the start of the cluster -- e.g. 
even if a core + * has a CPU ID of 4, it might be the first CPU in a cluster. Shift the + * registers right by the ID of the first CPU in the cluster. + */ +#define UPMECM0 "s3_7_c15_c3_4" +#define UPMECM1 "s3_7_c15_c4_4" + + CTRL_REG_SET(UPMECM0, + uncore_config.uc_cpu_masks[monid].uccm_regs[0]); + CTRL_REG_SET(UPMECM1, + uncore_config.uc_cpu_masks[monid].uccm_regs[1]); + +#if UNCORE_NCTRS > 8 +#define UPMECM2 "s3_7_c15_c8_5" +#define UPMECM3 "s3_7_c15_c9_5" + + CTRL_REG_SET(UPMECM2, + uncore_config.uc_cpu_masks[monid].uccm_regs[2]); + CTRL_REG_SET(UPMECM3, + uncore_config.uc_cpu_masks[monid].uccm_regs[3]); +#endif /* UNCORE_NCTRS > 8 */ +} + +#if UNCORE_PER_CLUSTER + +static inline void +uncmon_program_events_locked_r(unsigned int monid) +{ + const uintptr_t upmesr_offs[2] = {[0] = 0x41b0, [1] = 0x41b8, }; + + for (unsigned int i = 0; i < sizeof(upmesr_offs) / sizeof(upmesr_offs[0]); + i++) { + *(uint64_t *)(cpm_impl[monid] + upmesr_offs[i]) = + uncore_config.uc_events.uce_regs[i]; + } + + const uintptr_t upmecm_offs[4] = { + [0] = 0x4190, [1] = 0x4198, [2] = 0x41a0, [3] = 0x41a8, + }; + + for (unsigned int i = 0; i < sizeof(upmecm_offs) / sizeof(upmecm_offs[0]); + i++) { + *(uint64_t *)(cpm_impl[monid] + upmecm_offs[i]) = + uncore_config.uc_cpu_masks[monid].uccm_regs[i]; + } +} + +#endif /* UNCORE_PER_CLUSTER */ + +static void +uncmon_clear_int_locked_l(__unused unsigned int monid) +{ + __builtin_arm_wsr64(UPMSR, 0); +} + +#if UNCORE_PER_CLUSTER + +static void +uncmon_clear_int_locked_r(unsigned int monid) +{ + const uintptr_t upmsr_off = 0x41c0; + *(uint64_t *)(cpm_impl[monid] + upmsr_off) = 0; +} + +#endif /* UNCORE_PER_CLUSTER */ + +/* + * Get the PMI mask for the provided `monid` -- that is, the bitmap of CPUs + * that should be sent PMIs for a particular monitor. + */ +static uint64_t +uncmon_get_pmi_mask(unsigned int monid) +{ + uint64_t pmi_mask = uncore_pmi_mask; + +#if UNCORE_PER_CLUSTER + /* + * Set up the mask for the high bits. 
+ */ + uint64_t clust_cpumask; + if (monid == __ARM_CLUSTER_COUNT__ - 1) { + clust_cpumask = UINT64_MAX; + } else { + clust_cpumask = ((1ULL << clust_offs[monid + 1]) - 1); + } + + /* + * Mask off the low bits, if necessary. + */ + if (clust_offs[monid] != 0) { + clust_cpumask &= ~((1ULL << clust_offs[monid]) - 1); + } + + pmi_mask &= clust_cpumask; +#else /* UNCORE_PER_CLUSTER */ +#pragma unused(monid) +#endif /* !UNCORE_PER_CLUSTER */ + + return pmi_mask; +} + +/* + * Initialization routines for the uncore counters. + */ + +static void +uncmon_init_locked_l(unsigned int monid) +{ + /* + * UPMPCM defines the PMI core mask for the UPMCs -- which cores should + * receive interrupts on overflow. + */ + CTRL_REG_SET(UPMPCM, uncmon_get_pmi_mask(monid)); + uncmon_set_counting_locked_l(monid, + mt_uncore_enabled ? uncore_active_ctrs : 0); +} + +#if UNCORE_PER_CLUSTER + +static vm_size_t acc_impl_size = 0; +static uintptr_t acc_impl[__ARM_CLUSTER_COUNT__] = {}; +static uintptr_t acc_impl_phys[__ARM_CLUSTER_COUNT__] = {}; + +static void +uncmon_init_locked_r(unsigned int monid) +{ + const uintptr_t upmpcm_off = 0x1010; + + *(uint64_t *)(acc_impl[monid] + upmpcm_off) = uncmon_get_pmi_mask(monid); + uncmon_set_counting_locked_r(monid, + mt_uncore_enabled ? uncore_active_ctrs : 0); +} + +#endif /* UNCORE_PER_CLUSTER */ + +/* + * Initialize the uncore device for monotonic. + */ +static int +uncore_init(__unused mt_device_t dev) +{ +#if DEVELOPMENT || DEBUG + /* + * Development and debug kernels observe the `uncore_pmi_mask` boot-arg, + * allowing PMIs to be routed to the CPUs present in the supplied bitmap. + * Do some sanity checks on the value provided. 
+ */ + bool parsed_arg = PE_parse_boot_argn("uncore_pmi_mask", &uncore_pmi_mask, + sizeof(uncore_pmi_mask)); + if (parsed_arg) { +#if UNCORE_PER_CLUSTER + if (__builtin_popcount(uncore_pmi_mask) != __ARM_CLUSTER_COUNT__) { + panic("monotonic: invalid uncore PMI mask 0x%x", uncore_pmi_mask); + } + for (unsigned int i = 0; i < __ARM_CLUSTER_COUNT__; i++) { + if (__builtin_popcountll(uncmon_get_pmi_mask(i)) != 1) { + panic("monotonic: invalid uncore PMI CPU for cluster %d in mask 0x%x", + i, uncore_pmi_mask); + } + } +#else /* UNCORE_PER_CLUSTER */ + if (__builtin_popcount(uncore_pmi_mask) != 1) { + panic("monotonic: invalid uncore PMI mask 0x%x", uncore_pmi_mask); + } +#endif /* !UNCORE_PER_CLUSTER */ + } else +#endif /* DEVELOPMENT || DEBUG */ + { +#if UNCORE_PER_CLUSTER + for (int i = 0; i < __ARM_CLUSTER_COUNT__; i++) { + /* route to the first CPU in each cluster */ + uncore_pmi_mask |= (1ULL << clust_offs[i]); + } +#else /* UNCORE_PER_CLUSTER */ + /* arbitrarily route to core 0 */ + uncore_pmi_mask |= 1; +#endif /* !UNCORE_PER_CLUSTER */ + } + assert(uncore_pmi_mask != 0); + + unsigned int curmonid = uncmon_get_curid(); + + for (unsigned int monid = 0; monid < UNCORE_NMONITORS; monid++) { +#if UNCORE_PER_CLUSTER + cpm_impl[monid] = (uintptr_t)ml_io_map(cpm_impl_phys[monid], + cpm_impl_size); + assert(cpm_impl[monid] != 0); + + acc_impl[monid] = (uintptr_t)ml_io_map(acc_impl_phys[monid], + acc_impl_size); + assert(acc_impl[monid] != 0); +#endif /* UNCORE_PER_CLUSTER */ + + struct uncore_monitor *mon = &uncore_monitors[monid]; + lck_spin_init(&mon->um_lock, mt_lock_grp, NULL); + + int intrs_en = uncmon_lock(mon); + if (monid != curmonid) { +#if UNCORE_PER_CLUSTER + uncmon_init_locked_r(monid); +#endif /* UNCORE_PER_CLUSTER */ + } else { + uncmon_init_locked_l(monid); + } + uncmon_unlock(mon, intrs_en); + } + + mt_uncore_initted = true; + + return 0; +} + +/* + * Support for monotonic's mtd_read function. 
+ */ + +static void +uncmon_read_all_counters(unsigned int monid, unsigned int curmonid, + uint64_t ctr_mask, uint64_t *counts) +{ + struct uncore_monitor *mon = &uncore_monitors[monid]; + + int intrs_en = uncmon_lock(mon); + + for (unsigned int ctr = 0; ctr < UNCORE_NCTRS; ctr++) { + if (ctr_mask & (1ULL << ctr)) { + uncmon_update_locked(monid, curmonid, ctr); + counts[ctr] = mon->um_counts[ctr]; + } + } + + uncmon_unlock(mon, intrs_en); +} + +/* + * Read all monitor's counters. + */ +static int +uncore_read(uint64_t ctr_mask, uint64_t *counts_out) +{ + assert(ctr_mask != 0); + assert(counts_out != NULL); + + if (!uncore_active_ctrs) { + return EPWROFF; + } + if (ctr_mask & ~uncore_active_ctrs) { + return EINVAL; + } + + unsigned int curmonid = uncmon_get_curid(); + for (unsigned int monid = 0; monid < UNCORE_NMONITORS; monid++) { + /* + * Find this monitor's starting offset into the `counts_out` array. + */ + uint64_t *counts = counts_out + (UNCORE_NCTRS * monid); + + uncmon_read_all_counters(monid, curmonid, ctr_mask, counts); + } + + return 0; +} + +/* + * Support for monotonic's mtd_add function. + */ + +/* + * Add an event to the current uncore configuration. This doesn't take effect + * until the counters are enabled again, so there's no need to involve the + * monitors. 
+ */ +static int +uncore_add(struct monotonic_config *config, uint32_t *ctr_out) +{ + if (mt_uncore_enabled) { + return EBUSY; + } + + uint32_t available = ~uncore_active_ctrs & config->allowed_ctr_mask; + + if (available == 0) { + return ENOSPC; + } + + uint32_t valid_ctrs = (UINT32_C(1) << UNCORE_NCTRS) - 1; + if ((available & valid_ctrs) == 0) { + return E2BIG; + } + + uint32_t ctr = __builtin_ffsll(available) - 1; + + uncore_active_ctrs |= UINT64_C(1) << ctr; + uncore_config.uc_events.uce_ctrs[ctr] = config->event; + uint64_t cpu_mask = UINT64_MAX; + if (config->cpu_mask != 0) { + cpu_mask = config->cpu_mask; + } + for (int i = 0; i < UNCORE_NMONITORS; i++) { +#if UNCORE_PER_CLUSTER + const unsigned int shift = clust_offs[i]; +#else /* UNCORE_PER_CLUSTER */ + const unsigned int shift = 0; +#endif /* !UNCORE_PER_CLUSTER */ + uncore_config.uc_cpu_masks[i].uccm_masks[ctr] = cpu_mask >> shift; + } + + *ctr_out = ctr; + return 0; +} + +/* + * Support for monotonic's mtd_reset function. + */ + +/* + * Reset all configuration and disable the counters if they're currently + * counting. 
+ */ +static void +uncore_reset(void) +{ + mt_uncore_enabled = false; + + unsigned int curmonid = uncmon_get_curid(); + + for (unsigned int monid = 0; monid < UNCORE_NMONITORS; monid++) { + struct uncore_monitor *mon = &uncore_monitors[monid]; + bool remote = monid != curmonid; + + int intrs_en = uncmon_lock(mon); + if (remote) { +#if UNCORE_PER_CLUSTER + uncmon_set_counting_locked_r(monid, 0); +#endif /* UNCORE_PER_CLUSTER */ + } else { + uncmon_set_counting_locked_l(monid, 0); + } + + for (int ctr = 0; ctr < UNCORE_NCTRS; ctr++) { + if (uncore_active_ctrs & (1U << ctr)) { + if (remote) { +#if UNCORE_PER_CLUSTER + uncmon_write_counter_locked_r(monid, ctr, 0); +#endif /* UNCORE_PER_CLUSTER */ + } else { + uncmon_write_counter_locked_l(monid, ctr, 0); + } + } + } + + memset(&mon->um_snaps, 0, sizeof(mon->um_snaps)); + memset(&mon->um_counts, 0, sizeof(mon->um_counts)); + if (remote) { +#if UNCORE_PER_CLUSTER + uncmon_clear_int_locked_r(monid); +#endif /* UNCORE_PER_CLUSTER */ + } else { + uncmon_clear_int_locked_l(monid); + } + + uncmon_unlock(mon, intrs_en); + } + + uncore_active_ctrs = 0; + memset(&uncore_config, 0, sizeof(uncore_config)); + + for (unsigned int monid = 0; monid < UNCORE_NMONITORS; monid++) { + struct uncore_monitor *mon = &uncore_monitors[monid]; + bool remote = monid != curmonid; + + int intrs_en = uncmon_lock(mon); + if (remote) { +#if UNCORE_PER_CLUSTER + uncmon_program_events_locked_r(monid); +#endif /* UNCORE_PER_CLUSTER */ + } else { + uncmon_program_events_locked_l(monid); + } + uncmon_unlock(mon, intrs_en); + } +} + +/* + * Support for monotonic's mtd_enable function. 
+ */ + +static void +uncmon_set_enabled_l(unsigned int monid, bool enable) +{ + struct uncore_monitor *mon = &uncore_monitors[monid]; + int intrs_en = uncmon_lock(mon); + + if (enable) { + uncmon_program_events_locked_l(monid); + uncmon_set_counting_locked_l(monid, uncore_active_ctrs); + } else { + uncmon_set_counting_locked_l(monid, 0); + } + + uncmon_unlock(mon, intrs_en); +} + +#if UNCORE_PER_CLUSTER + +static void +uncmon_set_enabled_r(unsigned int monid, bool enable) +{ + struct uncore_monitor *mon = &uncore_monitors[monid]; + int intrs_en = uncmon_lock(mon); + + if (enable) { + uncmon_program_events_locked_r(monid); + uncmon_set_counting_locked_r(monid, uncore_active_ctrs); + } else { + uncmon_set_counting_locked_r(monid, 0); + } + + uncmon_unlock(mon, intrs_en); +} + +#endif /* UNCORE_PER_CLUSTER */ + +static void +uncore_set_enabled(bool enable) +{ + mt_uncore_enabled = enable; + + unsigned int curmonid = uncmon_get_curid(); + for (unsigned int monid = 0; monid < UNCORE_NMONITORS; monid++) { + if (monid != curmonid) { +#if UNCORE_PER_CLUSTER + uncmon_set_enabled_r(monid, enable); +#endif /* UNCORE_PER_CLUSTER */ + } else { + uncmon_set_enabled_l(monid, enable); + } + } +} + +/* + * Hooks in the machine layer. + */ + +static void +uncore_fiq(uint64_t upmsr) +{ + /* + * Determine which counters overflowed. + */ + uint64_t disable_ctr_mask = (upmsr & UPMSR_OVF_MASK) >> UPMSR_OVF_POS; + /* should not receive interrupts from inactive counters */ + assert(!(disable_ctr_mask & ~uncore_active_ctrs)); + + unsigned int monid = uncmon_get_curid(); + struct uncore_monitor *mon = &uncore_monitors[monid]; + + int intrs_en = uncmon_lock(mon); + + /* + * Disable any counters that overflowed. + */ + uncmon_set_counting_locked_l(monid, + uncore_active_ctrs & ~disable_ctr_mask); + + /* + * With the overflowing counters disabled, capture their counts and reset + * the UPMCs and their snapshots to 0. 
+ */ + for (unsigned int ctr = 0; ctr < UNCORE_NCTRS; ctr++) { + if (UPMSR_OVF(upmsr, ctr)) { + uncmon_update_locked(monid, monid, ctr); + mon->um_snaps[ctr] = 0; + uncmon_write_counter_locked_l(monid, ctr, 0); + } + } + + /* + * Acknowledge the interrupt, now that any overflowed PMCs have been reset. + */ + uncmon_clear_int_locked_l(monid); + + /* + * Re-enable all active counters. + */ + uncmon_set_counting_locked_l(monid, uncore_active_ctrs); + + uncmon_unlock(mon, intrs_en); +} + +static void +uncore_save(void) +{ + if (!uncore_active_ctrs) { + return; + } + + unsigned int curmonid = uncmon_get_curid(); + + for (unsigned int monid = 0; monid < UNCORE_NMONITORS; monid++) { + struct uncore_monitor *mon = &uncore_monitors[monid]; + int intrs_en = uncmon_lock(mon); + + if (mt_uncore_enabled) { + if (monid != curmonid) { +#if UNCORE_PER_CLUSTER + uncmon_set_counting_locked_r(monid, 0); +#endif /* UNCORE_PER_CLUSTER */ + } else { + uncmon_set_counting_locked_l(monid, 0); + } + } + + for (unsigned int ctr = 0; ctr < UNCORE_NCTRS; ctr++) { + if (uncore_active_ctrs & (1U << ctr)) { + uncmon_update_locked(monid, curmonid, ctr); + } + } + + mon->um_sleeping = true; + uncmon_unlock(mon, intrs_en); + } +} + +static void +uncore_restore(void) +{ + if (!uncore_active_ctrs) { + return; + } + unsigned int curmonid = uncmon_get_curid(); + + struct uncore_monitor *mon = &uncore_monitors[curmonid]; + int intrs_en = uncmon_lock(mon); + if (!mon->um_sleeping) { + goto out; + } + + for (unsigned int ctr = 0; ctr < UNCORE_NCTRS; ctr++) { + if (uncore_active_ctrs & (1U << ctr)) { + uncmon_write_counter_locked_l(curmonid, ctr, mon->um_snaps[ctr]); + } + } + uncmon_program_events_locked_l(curmonid); + uncmon_init_locked_l(curmonid); + mon->um_sleeping = false; + +out: + uncmon_unlock(mon, intrs_en); +} + +static void +uncore_early_init(void) +{ +#if UNCORE_PER_CLUSTER + /* + * Initialize the necessary PIO physical regions from the device tree. 
+	 */
+	DTEntry armio_entry = NULL;
+	if ((DTFindEntry("name", "arm-io", &armio_entry) != kSuccess)) {
+		panic("unable to find arm-io DT entry");
+	}
+
+	uint64_t *regs;
+	unsigned int regs_size = 0;
+	if (DTGetProperty(armio_entry, "acc-impl", (void **)&regs, &regs_size) !=
+	    kSuccess) {
+		panic("unable to find acc-impl DT property");
+	}
+	/*
+	 * Two 8-byte values are expected for each cluster -- the physical address
+	 * of the region and its size.
+	 */
+	const unsigned int expected_size =
+	    (typeof(expected_size))sizeof(uint64_t) * __ARM_CLUSTER_COUNT__ * 2;
+	if (regs_size != expected_size) {
+		panic("invalid size for acc-impl DT property");
+	}
+	for (int i = 0; i < __ARM_CLUSTER_COUNT__; i++) {
+		acc_impl_phys[i] = regs[i * 2]; /* even slots: physical address of each cluster's region */
+	}
+	acc_impl_size = regs[1]; /* only the first cluster's size entry is read */
+
+	regs_size = 0;
+	if (DTGetProperty(armio_entry, "cpm-impl", (void **)&regs, &regs_size) !=
+	    kSuccess) {
+		panic("unable to find cpm-impl property");
+	}
+	if (regs_size != expected_size) {
+		panic("invalid size for cpm-impl DT property");
+	}
+	for (int i = 0; i < __ARM_CLUSTER_COUNT__; i++) {
+		cpm_impl_phys[i] = regs[i * 2];
+	}
+	cpm_impl_size = regs[1]; /* only the first cluster's size entry is read */
+#endif /* UNCORE_PER_CLUSTER */
+}
+
+#endif /* HAS_UNCORE_CTRS */
 
 #pragma mark common hooks
 
 void
 mt_early_init(void)
 {
+#if HAS_UNCORE_CTRS
+	uncore_early_init();
+#endif /* HAS_UNCORE_CTRS */
 }
 
 void
@@ -330,11 +1260,19 @@ mt_cpu_up(cpu_data_t *cpu)
 void
 mt_sleep(void)
 {
+#if HAS_UNCORE_CTRS
+	uncore_save();
+#endif /* HAS_UNCORE_CTRS */
 }
 
 void
 mt_wake_per_core(void)
 {
+#if HAS_UNCORE_CTRS
+	if (mt_uncore_initted) {
+		uncore_restore();
+	}
+#endif /* HAS_UNCORE_CTRS */
 }
 
 uint64_t
@@ -439,7 +1377,11 @@ mt_fiq(void *cpu, uint64_t pmcr0, uint64_t upmsr)
 	mt_cpu_pmi(cpu, pmcr0);
 #endif /* !CPMU_AIC_PMI */
 
+#if HAS_UNCORE_CTRS
+	uncore_fiq(upmsr);
+#else /* HAS_UNCORE_CTRS */
 #pragma unused(upmsr)
+#endif /* !HAS_UNCORE_CTRS */
 }
 
 static uint32_t mt_xc_sync;
@@ -487,6 +1429,19 @@ struct mt_device mt_devices[] = {
 		.mtd_name = "core",
 		.mtd_init = core_init,
 	},
+#if HAS_UNCORE_CTRS
+	[1] = {
+		.mtd_name = "uncore",
+		.mtd_init = uncore_init,
+		.mtd_add = uncore_add,
+		.mtd_reset = uncore_reset,
+		.mtd_enable = uncore_set_enabled,
+		.mtd_read = uncore_read,
+
+		.mtd_nmonitors = UNCORE_NMONITORS,
+		.mtd_ncounters = UNCORE_NCTRS,
+	}
+#endif /* HAS_UNCORE_CTRS */
 };
 
 static_assert(
diff --git a/osfmk/arm64/pinst.s b/osfmk/arm64/pinst.s
index 740a63915..71c1230f8 100644
--- a/osfmk/arm64/pinst.s
+++ b/osfmk/arm64/pinst.s
@@ -104,7 +104,7 @@ _pinst_set_sctlr:
 
 #endif /* defined(KERNEL_INTEGRITY_KTRR) */
 
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
 
 	.text
 	.section __LAST,__pinst
@@ -123,5 +123,48 @@ _pinst_spsel_1:
 	check_instruction x2, x3, __pinst_spsel_1, 0xd65f03c0d50041bf
 	b __pinst_spsel_1
 
-#endif /* defined(KERNEL_INTEGRITY_KTRR)*/
+#if __APRR_SUPPORTED__
+
+/*
+ * APRR registers aren't covered by VMSA lockdown, so we'll keep these
+ * gadgets in pinst for protection against undesired execution.
+ */
+
+	.text
+	.section __LAST,__pinst
+	.align 2
+
+__pinst_set_aprr_el0:
+	msr		APRR_EL0, x0
+	ret
+
+__pinst_set_aprr_el1:
+	msr		APRR_EL1, x0
+	ret
+
+__pinst_set_aprr_shadow_mask_en_el1:
+	msr		APRR_SHADOW_MASK_EN_EL1, x0
+
+	ret
+
+	.text
+	.section __TEXT_EXEC,__text
+	.align 2
+
+	.globl _pinst_set_aprr_el0
+_pinst_set_aprr_el0:
+	check_instruction x2, x3, __pinst_set_aprr_el0, 0xd65f03c0d51cf200
+	b __pinst_set_aprr_el0
+
+	.globl _pinst_set_aprr_el1
+_pinst_set_aprr_el1:
+	check_instruction x2, x3, __pinst_set_aprr_el1, 0xd65f03c0d51cf220
+	b __pinst_set_aprr_el1
+
+	.globl _pinst_set_aprr_shadow_mask_en_el1
+_pinst_set_aprr_shadow_mask_en_el1:
+	check_instruction x2, x3, __pinst_set_aprr_shadow_mask_en_el1, 0xd65f03c0d51cf2c0
+	b __pinst_set_aprr_shadow_mask_en_el1
+#endif /* __APRR_SUPPORTED__ */
+#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
 
diff --git a/osfmk/arm64/platform_tests.c b/osfmk/arm64/platform_tests.c
index 8523c57ab..2fd98c9f7 100644
--- a/osfmk/arm64/platform_tests.c
+++ b/osfmk/arm64/platform_tests.c
@@ -89,6 +89,10 @@ kern_return_t arm64_late_pan_test(void);
 #include <ptrauth.h>
 kern_return_t arm64_ropjop_test(void);
 #endif
+#if defined(KERNEL_INTEGRITY_CTRR)
+kern_return_t ctrr_test(void);
+kern_return_t ctrr_test_cpu(void);
+#endif
 #if HAS_TWO_STAGE_SPR_LOCK
 kern_return_t arm64_spr_lock_test(void);
 extern void arm64_msr_lock_test(uint64_t);
@@ -270,6 +274,46 @@ lt_upgrade_downgrade_rw()
 
 	lck_rw_done(&lt_rwlock);
 }
+#if __AMP__
+const int limit = 1000000;
+static int lt_stress_local_counters[MAX_CPUS];
+
+lck_ticket_t lt_ticket_lock;
+
+static void
+lt_stress_ticket_lock()
+{
+	int local_counter = 0; /* increments made by this thread only */
+
+	uint cpuid = current_processor()->cpu_id;
+
+	kprintf("%s>cpu %d starting\n", __FUNCTION__, cpuid);
+
+	lck_ticket_lock(&lt_ticket_lock);
+	lt_counter++;
+	local_counter++;
+	lck_ticket_unlock(&lt_ticket_lock);
+
+	while (lt_counter < lt_target_done_threads) { /* spin until the shared counter reaches the expected thread count */
+		;
+	}
+
+	kprintf("%s>cpu %d started\n", __FUNCTION__, cpuid);
+
+	while (lt_counter < limit) {
+		lck_ticket_lock(&lt_ticket_lock);
+		if (lt_counter < limit) { /* re-check under the lock so the counter never passes limit */
+			lt_counter++;
+			local_counter++;
+		}
+		lck_ticket_unlock(&lt_ticket_lock);
+	}
+
+	lt_stress_local_counters[cpuid] = local_counter; /* read back by lt_test_locks() for the starvation check */
+
+	kprintf("%s>final counter %d cpu %d incremented the counter %d times\n", __FUNCTION__, lt_counter, cpuid, local_counter);
+}
+#endif
 
 static void
 lt_grab_hw_lock()
@@ -571,6 +615,106 @@ lt_start_lock_thread(thread_continue_t func)
 
 	thread_deallocate(thread);
 }
+#if __AMP__
+static void
+lt_bound_thread(void *arg, wait_result_t wres __unused)
+{
+	void (*func)(void) = (void (*)(void))arg;
+
+	int cpuid = OSIncrementAtomic((volatile SInt32 *)&lt_cpu_bind_id); /* each thread takes the next cpu id */
+
+	processor_t processor = processor_list;
+	while ((processor != NULL) && (processor->cpu_id != cpuid)) {
+		processor = processor->processor_list;
+	}
+
+	if (processor != NULL) { /* no processor with that id: run unbound */
+		thread_bind(processor);
+	}
+
+	thread_block(THREAD_CONTINUE_NULL);
+
+	func();
+
+	OSIncrementAtomic((volatile SInt32*) &lt_done_threads);
+}
+
+static void
+lt_e_thread(void *arg, wait_result_t wres __unused)
+{
+	void (*func)(void) = (void (*)(void))arg;
+
+	thread_t thread = current_thread();
+
+	spl_t s = splsched();
+	thread_lock(thread);
+	thread->sched_flags |= TH_SFLAG_ECORE_ONLY;
+	thread_unlock(thread);
+	splx(s);
+
+	thread_block(THREAD_CONTINUE_NULL);
+
+	func();
+
+	OSIncrementAtomic((volatile SInt32*) &lt_done_threads);
+}
+
+static void
+lt_p_thread(void *arg, wait_result_t wres __unused)
+{
+	void (*func)(void) = (void (*)(void))arg;
+
+	thread_t thread = current_thread();
+
+	spl_t s = splsched();
+	thread_lock(thread);
+	thread->sched_flags |= TH_SFLAG_PCORE_ONLY;
+	thread_unlock(thread);
+	splx(s);
+
+	thread_block(THREAD_CONTINUE_NULL);
+
+	func();
+
+	OSIncrementAtomic((volatile SInt32*) &lt_done_threads);
+}
+
+static void
+lt_start_lock_thread_e(thread_continue_t func)
+{
+	thread_t thread;
+	kern_return_t kr;
+
+	kr = kernel_thread_start(lt_e_thread, func, &thread);
+	assert(kr == KERN_SUCCESS);
+
+	thread_deallocate(thread);
+}
+
+static void
+lt_start_lock_thread_p(thread_continue_t func)
+{
+	thread_t thread;
+	kern_return_t kr;
+
+	kr = kernel_thread_start(lt_p_thread, func, &thread);
+	assert(kr == KERN_SUCCESS);
+
+	thread_deallocate(thread);
+}
+
+static void
+lt_start_lock_thread_bound(thread_continue_t func)
+{
+	thread_t thread;
+	kern_return_t kr;
+
+	kr = kernel_thread_start(lt_bound_thread, func, &thread);
+	assert(kr == KERN_SUCCESS);
+
+	thread_deallocate(thread);
+}
+#endif
 
 static kern_return_t
 lt_test_locks()
@@ -762,6 +906,47 @@ lt_test_locks()
 
 	lt_wait_for_lock_test_threads();
 	T_EXPECT_EQ_UINT(lt_counter, LOCK_TEST_ITERATIONS * lt_target_done_threads, NULL);
+#if __AMP__
+	/* Ticket locks stress test */
+	T_LOG("Running Ticket locks stress test with lck_ticket_lock()");
+	extern unsigned int real_ncpus;
+	lck_ticket_init(&lt_ticket_lock);
+	lt_reset();
+	lt_target_done_threads = real_ncpus;
+	for (processor_t processor = processor_list; processor != NULL; processor = processor->processor_list) {
+		lt_start_lock_thread_bound(lt_stress_ticket_lock);
+	}
+	lt_wait_for_lock_test_threads();
+	bool starvation = false;
+	uint total_local_count = 0;
+	for (processor_t processor = processor_list; processor != NULL; processor = processor->processor_list) {
+		starvation = starvation || (lt_stress_local_counters[processor->cpu_id] < 10);
+		total_local_count += lt_stress_local_counters[processor->cpu_id];
+	}
+	if (total_local_count != lt_counter) { /* per-cpu increments must add up to the shared counter */
+		T_FAIL("Lock failure\n");
+	} else if (starvation) {
+		T_FAIL("Lock starvation found\n");
+	} else {
+		T_PASS("Ticket locks stress test with lck_ticket_lock()");
+	}
+
+	/* AMP ticket locks stress test */
+	T_LOG("Running AMP Ticket locks stress test bound to clusters with lck_ticket_lock()");
+	lt_reset();
+	lt_target_done_threads = real_ncpus;
+	for (processor_t processor = processor_list; processor != NULL; processor = processor->processor_list) {
+		processor_set_t pset =
processor->processor_set;
+		if (pset->pset_cluster_type == PSET_AMP_P) {
+			lt_start_lock_thread_p(lt_stress_ticket_lock);
+		} else if (pset->pset_cluster_type == PSET_AMP_E) {
+			lt_start_lock_thread_e(lt_stress_ticket_lock);
+		} else {
+			lt_start_lock_thread(lt_stress_ticket_lock);
+		}
+	}
+	lt_wait_for_lock_test_threads();
+#endif
 
 	/* HW locks: trylocks */
 	T_LOG("Running test with hw_lock_try()");
@@ -1198,6 +1383,136 @@ arm64_munger_test()
 	return 0;
 }
 
+#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
+SECURITY_READ_ONLY_LATE(uint64_t) ctrr_ro_test;
+uint64_t ctrr_nx_test = 0xd65f03c0; /* RET */
+volatile uint64_t ctrr_exception_esr;
+volatile vm_offset_t ctrr_test_va; /* volatile to match the extern declarations in sleh.c, whose abort handlers read it */
+vm_offset_t ctrr_test_page;
+
+kern_return_t
+ctrr_test(void)
+{
+	processor_t p;
+	boolean_t ctrr_disable = FALSE;
+
+	PE_parse_boot_argn("-unsafe_kernel_text", &ctrr_disable, sizeof(ctrr_disable));
+
+	if (ctrr_disable) {
+		T_LOG("Skipping CTRR test when -unsafe_kernel_text boot-arg present");
+		return KERN_SUCCESS;
+	}
+
+	T_LOG("Running CTRR test.");
+
+	for (p = processor_list; p != NULL; p = p->processor_list) {
+		thread_bind(p);
+		thread_block(THREAD_CONTINUE_NULL);
+		T_LOG("Running CTRR test on cpu %d\n", p->cpu_id);
+		ctrr_test_cpu(); /* return value is not checked; failures are reported through T_EXPECT */
+	}
+
+	/* unbind thread from specific cpu */
+	thread_bind(PROCESSOR_NULL);
+	thread_block(THREAD_CONTINUE_NULL);
+
+	return KERN_SUCCESS;
+}
+
+/* test CTRR on a cpu, caller to bind thread to desired cpu */
+/* ctrr_test_page was reserved during bootstrap process */
+kern_return_t
+ctrr_test_cpu(void)
+{
+	ppnum_t ro_pn, nx_pn;
+	uint64_t *ctrr_ro_test_ptr;
+	void (*ctrr_nx_test_ptr)(void);
+	kern_return_t kr;
+	uint64_t prot = 0;
+	extern uint64_t rorgn_begin, rorgn_end;
+	extern vm_offset_t virtual_space_start;
+
+	/* rorgn = [rorgn_begin_va, rorgn_end_va) */
+
+	vm_offset_t rorgn_begin_va = phystokv(rorgn_begin);
+	vm_offset_t rorgn_end_va = phystokv(rorgn_end) + PAGE_SIZE;
+	vm_offset_t ro_test_va = (vm_offset_t)&ctrr_ro_test;
+	vm_offset_t nx_test_va = (vm_offset_t)&ctrr_nx_test;
+
+	T_EXPECT(rorgn_begin_va <= ro_test_va && ro_test_va < rorgn_end_va, "Expect ro_test_va to be inside the CTRR region");
+	T_EXPECT((nx_test_va < rorgn_begin_va) ^ (nx_test_va >= rorgn_end_va), "Expect nx_test_va to be outside the CTRR region");
+
+	ro_pn = pmap_find_phys(kernel_pmap, ro_test_va);
+	nx_pn = pmap_find_phys(kernel_pmap, nx_test_va);
+	T_EXPECT(ro_pn && nx_pn, "Expect ro page number and nx page number to be non zero");
+
+	T_LOG("test virtual page: %p, ctrr_ro_test: %p, ctrr_nx_test: %p, ro_pn: %x, nx_pn: %x ",
+	    (void *)ctrr_test_page, &ctrr_ro_test, &ctrr_nx_test, ro_pn, nx_pn);
+
+	prot = pmap_get_arm64_prot(kernel_pmap, ctrr_test_page);
+	T_EXPECT(~prot & ARM_TTE_VALID, "Expect ctrr_test_page to be unmapped");
+
+	T_LOG("Read only region test mapping virtual page %p to CTRR RO page number %d", (void *)ctrr_test_page, ro_pn);
+	kr = pmap_enter(kernel_pmap, ctrr_test_page, ro_pn,
+	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE);
+	T_EXPECT(kr == KERN_SUCCESS, "Expect pmap_enter of RW mapping to succeed");
+
+	// assert entire mmu prot path (Hierarchical protection model) is NOT RO
+	// fetch effective block level protections from table/block entries
+	prot = pmap_get_arm64_prot(kernel_pmap, ctrr_test_page);
+	T_EXPECT(ARM_PTE_EXTRACT_AP(prot) == AP_RWNA && (prot & ARM_PTE_PNX), "Mapping is EL1 RWNX");
+
+	ctrr_test_va = ctrr_test_page + (ro_test_va & PAGE_MASK);
+	ctrr_ro_test_ptr = (void *)ctrr_test_va;
+
+	T_LOG("Read only region test writing to %p to provoke data abort", ctrr_ro_test_ptr);
+
+	// should cause data abort
+	*ctrr_ro_test_ptr = 1;
+
+	// ensure write permission fault at expected level
+	// data abort handler will set ctrr_exception_esr when ctrr_test_va takes a permission fault
+
+	T_EXPECT(ESR_EC(ctrr_exception_esr) == ESR_EC_DABORT_EL1, "Data Abort from EL1 expected");
+	T_EXPECT(ISS_DA_FSC(ESR_ISS(ctrr_exception_esr)) == FSC_PERMISSION_FAULT_L3, "Permission Fault Expected");
+	T_EXPECT(ESR_ISS(ctrr_exception_esr) & ISS_DA_WNR, "Write Fault Expected");
+
+	ctrr_test_va = 0;
+	ctrr_exception_esr = 0;
+	pmap_remove(kernel_pmap, ctrr_test_page, ctrr_test_page + PAGE_SIZE);
+
+	T_LOG("No execute test mapping virtual page %p to CTRR PXN page number %d", (void *)ctrr_test_page, nx_pn);
+
+	kr = pmap_enter(kernel_pmap, ctrr_test_page, nx_pn,
+	    VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE);
+	T_EXPECT(kr == KERN_SUCCESS, "Expect pmap_enter of RX mapping to succeed");
+
+	// assert entire mmu prot path (Hierarchical protection model) is NOT XN
+	prot = pmap_get_arm64_prot(kernel_pmap, ctrr_test_page);
+	T_EXPECT(ARM_PTE_EXTRACT_AP(prot) == AP_RONA && (~prot & ARM_PTE_PNX), "Mapping is EL1 ROX");
+
+	ctrr_test_va = ctrr_test_page + (nx_test_va & PAGE_MASK);
+	ctrr_nx_test_ptr = (void *)ctrr_test_va;
+
+	T_LOG("No execute test calling ctrr_nx_test_ptr(): %p to provoke instruction abort", ctrr_nx_test_ptr);
+
+#if __has_feature(ptrauth_calls)
+	// must sign before calling if we're creating function pointers out of thin air
+	ctrr_nx_test_ptr = ptrauth_sign_unauthenticated(ctrr_nx_test_ptr, ptrauth_key_function_pointer, 0);
+#endif
+	// should cause prefetch abort
+	ctrr_nx_test_ptr();
+
+	// TODO: ensure execute permission fault at expected level
+	T_EXPECT(ESR_EC(ctrr_exception_esr) == ESR_EC_IABORT_EL1, "Instruction abort from EL1 Expected");
+	T_EXPECT(ISS_DA_FSC(ESR_ISS(ctrr_exception_esr)) == FSC_PERMISSION_FAULT_L3, "Permission Fault Expected");
+
+	ctrr_test_va = 0;
+	ctrr_exception_esr = 0;
+	pmap_remove(kernel_pmap, ctrr_test_page, ctrr_test_page + PAGE_SIZE);
+	return KERN_SUCCESS;
+}
+#endif /* defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) */
 
 #if HAS_TWO_STAGE_SPR_LOCK
 
diff --git a/osfmk/arm64/proc_reg.h b/osfmk/arm64/proc_reg.h
index 66a551ffe..f4d967d14 100644
--- a/osfmk/arm64/proc_reg.h
+++ b/osfmk/arm64/proc_reg.h
@@ -86,6 +86,17 @@
  * global mappings would be visible to userspace unless we
invalidate them on
  * eret.
  */
+#if XNU_MONITOR
+/*
+ * Please note that because we indirect through the thread register in order to
+ * locate the kernel, and because we unmap most of the kernel, the security
+ * model of the PPL is undermined by __ARM_KERNEL_PROTECT__, as we rely on
+ * kernel controlled data to direct codeflow in the exception vectors.
+ *
+ * If we want to ship XNU_MONITOR paired with __ARM_KERNEL_PROTECT__, we will
+ * need to find a performant solution to this problem.
+ */
+#endif
 #endif /* __ARM_KERNEL_PROTECT */
 
 /*
@@ -1552,13 +1563,223 @@ typedef enum {
 #define CORESIGHT_REGIONS   4
 #define CORESIGHT_SIZE      0x1000
 
+#if __APRR_SUPPORTED__
+/*
+ * APRR_EL0/APRR_EL1
+ *
+ *  63                 0
+ * +--------------------+
+ * | Attr[15:0]RWX[3:0] |
+ * +--------------------+
+ *
+ * These registers consist of 16 4-bit fields.
+ *
+ * The attribute index consists of the access protection
+ * and execution protections on a mapping.  The index
+ * for a given mapping type is constructed as follows.
+ *
+ * Attribute Index
+ *
+ *     3       2      1     0
+ * +-------+-------+-----+----+
+ * | AP[1] | AP[0] | PXN | XN |
+ * +-------+-------+-----+----+
+ *
+ * The attribute for a given index determines what
+ * protections are disabled for that mapping type
+ * (protections beyond the scope of the standard ARM
+ * protections for a mapping cannot be granted via
+ * APRR).
+ *
+ * Attribute
+ *
+ *       3      2   1   0
+ * +----------+---+---+---+
+ * | Reserved | R | W | X |
+ * +----------+---+---+---+
+ *
+ * Where:
+ *   R: Read is allowed.
+ *   W: Write is allowed.
+ *   X: Execute is allowed.
+ */
+
+#define APRR_IDX_XN  (1ULL)
+#define APRR_IDX_PXN (2ULL)
+
+
+#define APRR_IDX_XN_SHIFT  (0ULL)
+#define APRR_IDX_PXN_SHIFT (1ULL)
+#define APRR_IDX_APSHIFT   (2ULL)
+
+#endif /* __APRR_SUPPORTED__ */
+
+
+#if __APRR_SUPPORTED__
+
+#define APRR_ATTR_X (1ULL)
+#define APRR_ATTR_W (2ULL)
+#define APRR_ATTR_R (4ULL)
+
+#define APRR_ATTR_WX  (APRR_ATTR_W | APRR_ATTR_X)
+#define APRR_ATTR_RX  (APRR_ATTR_R | APRR_ATTR_X)
+#define APRR_ATTR_RWX (APRR_ATTR_R | APRR_ATTR_W | APRR_ATTR_X)
+
+#define APRR_ATTR_NONE (0ULL)
+#define APRR_ATTR_MASK (APRR_ATTR_RWX)
+
+#define APRR_RESERVED_MASK (0x8888888888888888ULL)
+#endif /* __APRR_SUPPORTED__ */
+
+#if __APRR_SUPPORTED__
+#define XPRR_FIRM_RX_PERM  (0ULL)
+#define XPRR_PPL_RW_PERM   (1ULL)
+#define XPRR_FIRM_RO_PERM  (2ULL)
+#define XPRR_KERN_RW_PERM  (3ULL)
+#define XPRR_FIRM_RW_PERM  (4ULL)
+#define XPRR_USER_JIT_PERM (5ULL)
+#define XPRR_KERN0_RW_PERM (6ULL)
+#define XPRR_USER_RW_PERM  (7ULL)
+#define XPRR_PPL_RX_PERM   (8ULL)
+#define XPRR_PPL_RO_PERM   (9ULL)
+#define XPRR_KERN_RX_PERM  (10ULL)
+#define XPRR_KERN_RO_PERM  (11ULL)
+#define XPRR_KERN0_RX_PERM (12ULL)
+#define XPRR_USER_RX_PERM  (13ULL)
+#define XPRR_KERN0_RO_PERM (14ULL)
+#define XPRR_USER_RO_PERM  (15ULL)
+#define XPRR_MAX_PERM      (15ULL)
+
+#define XPRR_VERSION_NONE  (0ULL)
+#define XPRR_VERSION_APRR  (1ULL)
+
+
+#endif /* __APRR_SUPPORTED__ */
+
+#if __APRR_SUPPORTED__
+/* Indices for attributes, named based on how we intend to use them. */
+#define APRR_FIRM_RX_INDEX  (0ULL)  /* AP_RWNA, PX, X */
+#define APRR_FIRM_RO_INDEX  (1ULL)  /* AP_RWNA, PX, XN */
+#define APRR_PPL_RW_INDEX   (2ULL)  /* AP_RWNA, PXN, X */
+#define APRR_KERN_RW_INDEX  (3ULL)  /* AP_RWNA, PXN, XN */
+#define APRR_FIRM_RW_INDEX  (4ULL)  /* AP_RWRW, PX, X */
+#define APRR_KERN0_RW_INDEX (5ULL)  /* AP_RWRW, PX, XN */
+#define APRR_USER_JIT_INDEX (6ULL)  /* AP_RWRW, PXN, X */
+#define APRR_USER_RW_INDEX  (7ULL)  /* AP_RWRW, PXN, XN */
+#define APRR_PPL_RX_INDEX   (8ULL)  /* AP_RONA, PX, X */
+#define APRR_KERN_RX_INDEX  (9ULL)  /* AP_RONA, PX, XN */
+#define APRR_PPL_RO_INDEX   (10ULL) /* AP_RONA, PXN, X */
+#define APRR_KERN_RO_INDEX  (11ULL) /* AP_RONA, PXN, XN */
+#define APRR_KERN0_RX_INDEX (12ULL) /* AP_RORO, PX, X */
+#define APRR_KERN0_RO_INDEX (13ULL) /* AP_RORO, PX, XN */
+#define APRR_USER_RX_INDEX  (14ULL) /* AP_RORO, PXN, X */
+#define APRR_USER_RO_INDEX  (15ULL) /* AP_RORO, PXN, XN */
+#define APRR_MAX_INDEX      (15ULL) /* For sanity checking index values */
+#endif /* __APRR_SUPPORTED__ */
+
+
+#if __APRR_SUPPORTED__
+#define APRR_SHIFT_FOR_IDX(x) \
+	((x) << 2ULL)
+
+/* Shifts for attributes, named based on how we intend to use them. */
+#define APRR_FIRM_RX_SHIFT  (0ULL)  /* AP_RWNA, PX, X */
+#define APRR_FIRM_RO_SHIFT  (4ULL)  /* AP_RWNA, PX, XN */
+#define APRR_PPL_RW_SHIFT   (8ULL)  /* AP_RWNA, PXN, X */
+#define APRR_KERN_RW_SHIFT  (12ULL) /* AP_RWNA, PXN, XN */
+#define APRR_FIRM_RW_SHIFT  (16ULL) /* AP_RWRW, PX, X */
+#define APRR_KERN0_RW_SHIFT (20ULL) /* AP_RWRW, PX, XN */
+#define APRR_USER_JIT_SHIFT (24ULL) /* AP_RWRW, PXN, X */
+#define APRR_USER_RW_SHIFT  (28ULL) /* AP_RWRW, PXN, XN */
+#define APRR_PPL_RX_SHIFT   (32ULL) /* AP_RONA, PX, X */
+#define APRR_KERN_RX_SHIFT  (36ULL) /* AP_RONA, PX, XN */
+#define APRR_PPL_RO_SHIFT   (40ULL) /* AP_RONA, PXN, X */
+#define APRR_KERN_RO_SHIFT  (44ULL) /* AP_RONA, PXN, XN */
+#define APRR_KERN0_RX_SHIFT (48ULL) /* AP_RORO, PX, X */
+#define APRR_KERN0_RO_SHIFT (52ULL) /* AP_RORO, PX, XN */
+#define APRR_USER_RX_SHIFT  (56ULL) /* AP_RORO, PXN, X */
+#define APRR_USER_RO_SHIFT  (60ULL) /* AP_RORO, PXN, XN */
+
+#define ARM_PTE_APRR_MASK \
+	(ARM_PTE_APMASK | ARM_PTE_PNXMASK | ARM_PTE_NXMASK)
+
+#define ARM_PTE_XPRR_MASK ARM_PTE_APRR_MASK
+
+#define APRR_INDEX_TO_PTE(x) \
+	((pt_entry_t) \
+	 (((x) & 0x8) ? ARM_PTE_AP(0x2) : 0) | \
+	 (((x) & 0x4) ? ARM_PTE_AP(0x1) : 0) | \
+	 (((x) & 0x2) ? ARM_PTE_PNX : 0) | \
+	 (((x) & 0x1) ? ARM_PTE_NX : 0))
+
+#define PTE_TO_APRR_INDEX(x) \
+	((ARM_PTE_EXTRACT_AP(x) << APRR_IDX_APSHIFT) | \
+	 (((x) & ARM_PTE_PNXMASK) ? APRR_IDX_PXN : 0) | \
+	 (((x) & ARM_PTE_NXMASK) ? APRR_IDX_XN : 0))
+
+#endif /* __APRR_SUPPORTED__ */
+
+#if __APRR_SUPPORTED__
+
+#define APRR_EXTRACT_IDX_ATTR(_aprr_value, _idx) \
+	(((_aprr_value) >> APRR_SHIFT_FOR_IDX(_idx)) & APRR_ATTR_MASK)
+
+#define APRR_REMOVE(x) (~(x))
+
+#define APRR_EL1_UNRESTRICTED (0x4455445566666677ULL)
+
+#define APRR_EL1_RESET \
+	APRR_EL1_UNRESTRICTED
+
+#define APRR_EL1_BASE \
+	APRR_EL1_UNRESTRICTED
+
+#if XNU_MONITOR
+#define APRR_EL1_DEFAULT \
+	(APRR_EL1_BASE & \
+	 (APRR_REMOVE((APRR_ATTR_WX << APRR_PPL_RW_SHIFT) | \
+	 (APRR_ATTR_WX << APRR_PPL_RO_SHIFT) | \
+	 (APRR_ATTR_WX << APRR_PPL_RX_SHIFT))))
+
+#define APRR_EL1_PPL \
+	(APRR_EL1_BASE & \
+	 (APRR_REMOVE((APRR_ATTR_X << APRR_PPL_RW_SHIFT) | \
+	 (APRR_ATTR_WX << APRR_PPL_RO_SHIFT) | \
+	 (APRR_ATTR_W << APRR_PPL_RX_SHIFT))))
+#else
+#define APRR_EL1_DEFAULT \
+	APRR_EL1_BASE
+#endif
 
+#define APRR_EL0_UNRESTRICTED (0x4545010167670101ULL)
 
+#define APRR_EL0_RESET \
+	APRR_EL0_UNRESTRICTED
 
+#if XNU_MONITOR
+#define APRR_EL0_BASE \
+	(APRR_EL0_UNRESTRICTED & \
+	 (APRR_REMOVE((APRR_ATTR_RWX << APRR_PPL_RW_SHIFT) | \
+	 (APRR_ATTR_RWX << APRR_PPL_RX_SHIFT) | \
+	 (APRR_ATTR_RWX << APRR_PPL_RO_SHIFT))))
+#else
+#define APRR_EL0_BASE \
+	APRR_EL0_UNRESTRICTED
+#endif
 
+#define APRR_EL0_JIT_RW \
+	(APRR_EL0_BASE & APRR_REMOVE(APRR_ATTR_X << APRR_USER_JIT_SHIFT))
 
+#define APRR_EL0_JIT_RX \
+	(APRR_EL0_BASE & APRR_REMOVE(APRR_ATTR_W << APRR_USER_JIT_SHIFT))
 
+#define APRR_EL0_JIT_RWX \
+	APRR_EL0_BASE
 
+#define APRR_EL0_DEFAULT \
+	APRR_EL0_BASE
+
+#endif /* __APRR_SUPPORTED__ */
 
 
 /*
@@ -1694,5 +1915,12 @@ b.mi $2 // Unsigned "strictly less than"
 #define MSR(reg, src)  __asm__ volatile ("msr " reg ", %0" :: "r" (src))
 #define MRS(dest, reg) __asm__ volatile ("mrs %0, " reg : "=r" (dest))
 
+#if XNU_MONITOR
+#define __ARM_PTE_PHYSMAP__ 1
+#define PPL_STATE_KERNEL    0
+#define PPL_STATE_DISPATCH  1
+#define PPL_STATE_PANIC     2
+#define PPL_STATE_EXCEPTION 3
+#endif
 
 #endif /* _ARM64_PROC_REG_H_ */
diff --git a/osfmk/arm64/sleh.c b/osfmk/arm64/sleh.c
index 705e31444..b6a1f10ae 100644
--- a/osfmk/arm64/sleh.c
+++ b/osfmk/arm64/sleh.c
@@ -203,6 +203,8 @@ extern volatile uint32_t spr_lock_exception_esr;
 #define CPU_NAME "Twister"
 #elif defined(APPLEHURRICANE)
 #define CPU_NAME "Hurricane"
+#elif defined(APPLELIGHTNING)
+#define CPU_NAME "Lightning"
 #else
 #define CPU_NAME "Unknown"
 #endif
@@ -222,6 +224,10 @@ extern volatile uint32_t spr_lock_exception_esr;
 #define WT_REASON_REG_VIOLATION 8
 #endif
 
+#if defined(HAS_IPI)
+void cpu_signal_handler(void);
+extern unsigned int gFastIPI;
+#endif /* defined(HAS_IPI) */
 
 extern vm_offset_t static_memory_end;
 
@@ -502,6 +508,18 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far)
 		thread_exception_return();
 
 	case ESR_EC_IABORT_EL1:
+#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
+		{
+			extern volatile vm_offset_t ctrr_test_va;
+			if (ctrr_test_va && far == ctrr_test_va) { /* fault is at the address ctrr_test_cpu() published */
+				extern volatile uint64_t ctrr_exception_esr;
+				ctrr_exception_esr = esr; /* recorded for the test's T_EXPECT checks */
+				/* return to the instruction immediately after the call to NX page */
+				set_saved_state_pc(state, get_saved_state_lr(state));
+				break;
+			}
+		}
+#endif
 
 		panic_with_thread_kernel_state("Kernel instruction fetch abort", state);
 
@@ -944,7 +962,7 @@ is_translation_fault(fault_status_t status)
 	}
 }
 
-#if __ARM_PAN_AVAILABLE__
+#if __ARM_PAN_AVAILABLE__ || defined(KERNEL_INTEGRITY_CTRR)
 static int
 is_permission_fault(fault_status_t status)
 {
@@ -1189,6 +1207,15 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad
 		 * when running with KTRR.
 		 */
 
+#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
+		extern volatile vm_offset_t ctrr_test_va;
+		if (ctrr_test_va && fault_addr == ctrr_test_va && is_permission_fault(fault_code)) {
+			extern volatile uint64_t ctrr_exception_esr;
+			ctrr_exception_esr = esr; /* recorded for the test's T_EXPECT checks */
+			add_saved_state_pc(state, 4); /* presumably steps past the faulting store -- TODO confirm */
+			return;
+		}
+#endif
 
 #if __ARM_PAN_AVAILABLE__ && defined(CONFIG_XNUPOST)
 		if (is_permission_fault(fault_code) && !(get_saved_state_cpsr(state) & PSR64_PAN) &&
@@ -1497,6 +1524,22 @@ sleh_fiq(arm_saved_state_t *state)
 	uint64_t pmcr0 = 0, upmsr = 0;
 #endif /* MONOTONIC_FIQ */
 
+#if defined(HAS_IPI)
+	boolean_t    is_ipi = FALSE;
+	uint64_t     ipi_sr = 0;
+
+	if (gFastIPI) {
+		MRS(ipi_sr, ARM64_REG_IPI_SR);
+
+		if (ipi_sr & 1) { /* bit 0 set: treat this FIQ as an IPI */
+			is_ipi = TRUE;
+		}
+	}
+
+	if (is_ipi) {
+		type = DBG_INTR_TYPE_IPI;
+	} else
+#endif /* defined(HAS_IPI) */
 #if MONOTONIC_FIQ
 	if (mt_pmi_pending(&pmcr0, &upmsr)) {
 		type = DBG_INTR_TYPE_PMI;
@@ -1508,6 +1551,21 @@ sleh_fiq(arm_saved_state_t *state)
 
 	sleh_interrupt_handler_prologue(state, type);
 
+#if defined(HAS_IPI)
+	if (is_ipi) {
+		/*
+		 * Order is important here: we must ack the IPI by writing IPI_SR
+		 * before we call cpu_signal_handler().  Otherwise, there will be
+		 * a window between the completion of pending-signal processing in
+		 * cpu_signal_handler() and the ack during which a newly-issued
+		 * IPI to this CPU may be lost.  ISB is required to ensure the msr
+		 * is retired before execution of cpu_signal_handler().
+		 */
+		MSR(ARM64_REG_IPI_SR, ipi_sr);
+		__builtin_arm_isb(ISB_SY);
+		cpu_signal_handler();
+	} else
+#endif /* defined(HAS_IPI) */
 #if MONOTONIC_FIQ
 	if (type == DBG_INTR_TYPE_PMI) {
 		mt_fiq(getCpuDatap(), pmcr0, upmsr);
diff --git a/osfmk/arm64/start.s b/osfmk/arm64/start.s
index 4e964ca8b..a5d29d6c6 100644
--- a/osfmk/arm64/start.s
+++ b/osfmk/arm64/start.s
@@ -39,6 +39,33 @@
 #endif /* __ARM_KERNEL_PROTECT__ */
 
 
+#if __APRR_SUPPORTED__
+
+.macro MSR_APRR_EL1_X0
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
+	bl		EXT(pinst_set_aprr_el1)
+#else
+	msr		APRR_EL1, x0
+#endif
+.endmacro
+
+.macro MSR_APRR_EL0_X0
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
+	bl		EXT(pinst_set_aprr_el0)
+#else
+	msr		APRR_EL0, x0
+#endif
+.endmacro
+
+.macro MSR_APRR_SHADOW_MASK_EN_EL1_X0
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
+	bl		EXT(pinst_set_aprr_shadow_mask_en_el1)
+#else
+	msr		APRR_SHADOW_MASK_EN_EL1, x0
+#endif
+.endmacro
+
+#endif /* __APRR_SUPPORTED__ */
 
 .macro MSR_VBAR_EL1_X0
 #if defined(KERNEL_INTEGRITY_KTRR)
@@ -128,13 +155,32 @@ LEXT(reset_vector)
 	msr		OSLAR_EL1, xzr
 	msr		DAIFSet, #(DAIFSC_ALL)				// Disable all interrupts
 
-#if !(defined(KERNEL_INTEGRITY_KTRR))
+#if !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR))
 	// Set low reset vector before attempting any loads
 	adrp	x0, EXT(LowExceptionVectorBase)@page
 	add		x0, x0, EXT(LowExceptionVectorBase)@pageoff
 	msr		VBAR_EL1, x0
 #endif
 
+#if __APRR_SUPPORTED__
+	MOV64	x0, APRR_EL1_DEFAULT
+#if XNU_MONITOR
+	adrp	x4, EXT(pmap_ppl_locked_down)@page
+	ldrb	w5, [x4, #EXT(pmap_ppl_locked_down)@pageoff]
+	cmp		w5, #0
+	b.ne	1f
+
+	// If the PPL is not locked down, we start in PPL mode.
+	MOV64	x0, APRR_EL1_PPL
+1:
+#endif /* XNU_MONITOR */
+
+	MSR_APRR_EL1_X0
+
+	// Load up the default APRR_EL0 value.
+	MOV64	x0, APRR_EL0_DEFAULT
+	MSR_APRR_EL0_X0
+#endif /* __APRR_SUPPORTED__ */
 
 #if defined(KERNEL_INTEGRITY_KTRR)
 	/*
@@ -179,7 +225,11 @@ Lskip_ktrr:
 	adrp	x19, EXT(ResetHandlerData)@page			// Get address of the reset handler data
 	add		x19, x19, EXT(ResetHandlerData)@pageoff
 	mrs		x15, MPIDR_EL1						// Load MPIDR to get CPU number
+#if HAS_CLUSTER
+	and		x0, x15, #0xFFFF					// CPU number in Affinity0, cluster ID in Affinity1
+#else
 	and		x0, x15, #0xFF						// CPU number is in MPIDR Affinity Level 0
+#endif
 	ldr		x1, [x19, CPU_DATA_ENTRIES]			// Load start of data entries
 	add		x3, x1, MAX_CPUS * 16				// end addr of data entries = start + (16 * MAX_CPUS)
 Lcheck_cpu_data_entry:
@@ -194,6 +244,57 @@ Lnext_cpu_data_entry:
 	b.eq	Lskip_cpu_reset_handler				// Not found
 	b		Lcheck_cpu_data_entry	// loop
 Lfound_cpu_data_entry:
+#if defined(KERNEL_INTEGRITY_CTRR)
+	/*
+	 * Program and lock CTRR if this CPU is non-boot cluster master. boot cluster will be locked
+	 * in machine_lockdown. pinst insns protected by VMSA_LOCK
+	 * A_PXN and A_MMUON_WRPROTECT options provide something close to KTRR behavior
+	 */
+
+	/* spin until bootstrap core has completed machine lockdown */
+	adrp	x17, EXT(lockdown_done)@page
+1:
+	ldr		x18, [x17, EXT(lockdown_done)@pageoff]
+	cbz		x18, 1b
+
+	// load stashed rorgn_begin
+	adrp	x17, EXT(rorgn_begin)@page
+	add		x17, x17, EXT(rorgn_begin)@pageoff
+	ldr		x17, [x17]
+	// if rorgn_begin is zero, we're debugging. skip enabling ctrr
+	cbz		x17, Lskip_ctrr
+
+	// load stashed rorgn_end
+	adrp	x19, EXT(rorgn_end)@page
+	add		x19, x19, EXT(rorgn_end)@pageoff
+	ldr		x19, [x19]
+	cbz		x19, Lskip_ctrr
+
+	mrs		x18, ARM64_REG_CTRR_LOCK_EL1
+	cbnz	x18, Lskip_ctrr		/* don't touch if already locked */
+	ldr		w18, [x21, CLUSTER_MASTER]	/* cluster master is unsigned int (32bit) */
+	cbz		w18, Lspin_ctrr_unlocked	/* non-cluster master spins if CTRR unlocked (unexpected) */
+	msr		ARM64_REG_CTRR_A_LWR_EL1, x17
+	msr		ARM64_REG_CTRR_A_UPR_EL1, x19
+	mov		x18, #(CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT)
+	msr		ARM64_REG_CTRR_CTL_EL1, x18
+	mov		x18, #1
+	msr		ARM64_REG_CTRR_LOCK_EL1, x18
+
+
+	isb
+	tlbi	vmalle1
+	dsb		ish
+	isb
+Lspin_ctrr_unlocked:
+	/* we shouldn't ever be here as cpu start is serialized by cluster in cpu_start(),
+	 * and first core started in cluster is designated cluster master and locks
+	 * both core and cluster. subsequent cores in same cluster will run locked
+	 * from reset vector */
+	mrs		x18, ARM64_REG_CTRR_LOCK_EL1
+	cbz		x18, Lspin_ctrr_unlocked
+Lskip_ctrr:
+#endif
 	adrp	x20, EXT(const_boot_args)@page
 	add		x20, x20, EXT(const_boot_args)@pageoff
 	ldr		x0, [x21, CPU_RESET_HANDLER]		// Call CPU reset handler
@@ -210,7 +311,13 @@ Lfound_cpu_data_entry:
 	bne		Lskip_cpu_reset_handler
 1:
 
+#if HAS_NEX_PG
+	bl		EXT(set_nex_pg)
+#endif
 
+#if HAS_BP_RET
+	bl		EXT(set_bp_ret)
+#endif
 
 #if __ARM_KERNEL_PROTECT__ && defined(KERNEL_INTEGRITY_KTRR)
 	/*
@@ -299,7 +406,7 @@ LEXT(LowExceptionVectorBase)
 	b		.
 	.align 12, 0
 
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
 /*
  * Provide a global symbol so that we can narrow the V=P mapping to cover
  * this page during arm_vm_init.
@@ -308,7 +415,7 @@ LEXT(LowExceptionVectorBase)
 	.globl EXT(bootstrap_instructions)
 LEXT(bootstrap_instructions)
 
-#endif /* defined(KERNEL_INTEGRITY_KTRR)*/
+#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
 	.align 2
 	.globl EXT(resume_idle_cpu)
 LEXT(resume_idle_cpu)
@@ -325,13 +432,13 @@ LEXT(start_cpu)
 
 	.align 2
 start_cpu:
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
 	// This is done right away in reset vector for pre-KTRR devices
 	// Set low reset vector now that we are in the KTRR-free zone
 	adrp	x0, EXT(LowExceptionVectorBase)@page
 	add		x0, x0, EXT(LowExceptionVectorBase)@pageoff
 	MSR_VBAR_EL1_X0
-#endif /* defined(KERNEL_INTEGRITY_KTRR)*/
+#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
 
 	// x20 set to BootArgs phys address
 	// x21 set to cpu data phys address
@@ -353,7 +460,7 @@ start_cpu:
 
 
 	// Set SP_EL1 to exception stack
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
 	mov		x1, lr
 	bl		EXT(pinst_spsel_1)
 	mov		lr, x1
@@ -494,6 +601,29 @@ LEXT(start_first_cpu)
 	add		x0, x0, EXT(LowExceptionVectorBase)@pageoff
 	MSR_VBAR_EL1_X0
 
+#if __APRR_SUPPORTED__
+	// Save the LR
+	mov		x1, lr
+
+#if XNU_MONITOR
+	// If the PPL is supported, we start out in PPL mode.
+	MOV64	x0, APRR_EL1_PPL
+#else
+	// Otherwise, we start out in default mode.
+	MOV64	x0, APRR_EL1_DEFAULT
+#endif
+
+	// Set the APRR state for EL1.
+	MSR_APRR_EL1_X0
+
+	// Set the APRR state for EL0.
+	MOV64	x0, APRR_EL0_DEFAULT
+	MSR_APRR_EL0_X0
+
+
+	// Restore the LR.
+	mov		lr, x1
+#endif /* __APRR_SUPPORTED__ */
 
 	// Get the kernel memory parameters from the boot args
 	ldr		x22, [x20, BA_VIRT_BASE]			// Get the kernel virt base
@@ -514,7 +644,7 @@ LEXT(start_first_cpu)
 	sub		x0, x0, x23
 
 	// Set SP_EL1 to exception stack
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
 	bl		EXT(pinst_spsel_1)
 #else
 	msr		SPSel, #1
@@ -657,7 +787,7 @@ common_start:
 	 *	TTBR0 - V=P table @ top of kernel
 	 *	TTBR1 - KVA table @ top of kernel + 1 page
 	 */
-#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
 	/* Note that for KTRR configurations, the V=P map will be modified by
 	 * arm_vm_init.c.
 	 */
@@ -1075,9 +1205,186 @@ Lskip_skye_post_a1_workarounds:
 #endif /* defined(APPLEMONSOON) */
 
 
+#if defined(APPLEVORTEX)
+	ARM64_IS_PCORE x15
+
+	// Skip if not P-core
+	cbz		x15, Lskip_cyprus_pcore_only
+
+	mrs		x12, ARM64_REG_HID1
+
+	mrs		x13, MIDR_EL1
+	ubfx	x14, x13, #MIDR_EL1_PNUM_SHIFT, #12
+	// Should be applied to all Aruba variants, but only Cyprus variants B0 and later
+	cmp		x14, #0xb // Part number 11 => Cyprus, 16 => Aruba
+	bne		Lbr_kill
+	ubfx	x14, x13, #MIDR_EL1_VAR_SHIFT, #4
+	cbz		x14, Lskip_br_kill // variant 0 => Cyprus AX, 1 => Cyprus BX
+
+Lbr_kill:
+
+	// rdar://problem/36716477: data corruption due to incorrect branch predictor resolution
+	orr		x12, x12, ARM64_REG_HID1_enaBrKillLimit
+
+Lskip_br_kill:
+
+	// rdar://problem/34435356: segfaults due to IEX clock-gating
+	orr		x12, x12, ARM64_REG_HID1_rccForceAllIexL3ClksOn
+	msr		ARM64_REG_HID1, x12
+
+#if ARM64_BOARD_CONFIG_T8027
+	// rdar://problem/40695685: Enable BIF fill buffer stall logic to prevent skid buffer overflow (Aruba A1 only)
+	mrs		x12, ARM64_REG_HID5
+	orr		x12, x12, ARM64_REG_HID5_EnableDnFIFORdStall
+	msr		ARM64_REG_HID5, x12
+
+#endif /* ARM64_BOARD_CONFIG_T8027 */
+
+	// Prevent ordered loads from being dispatched from LSU until all prior loads have completed.
+ // rdar://problem/34095873: AF2 ordering rules allow ARM device ordering violations + mrs x12, ARM64_REG_HID4 + orr x12, x12, ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd + msr ARM64_REG_HID4, x12 + + // rdar://problem/38482968: [Cyprus Tunable] Poisoned cache line crossing younger load is not redirected by older load-barrier + mrs x12, ARM64_REG_HID3 + orr x12, x12, ARM64_REG_HID3_DisColorOpt + msr ARM64_REG_HID3, x12 + + // rdar://problem/41056604: disable faster launches of uncacheable unaligned stores to workaround load/load ordering violation + mrs x12, ARM64_REG_HID11 + orr x12, x12, ARM64_REG_HID11_DisX64NTLnchOpt + msr ARM64_REG_HID11, x12 + + b Lskip_cyprus_ecore_only + +Lskip_cyprus_pcore_only: + + // Prevent ordered loads from being dispatched from LSU until all prior loads have completed. + // rdar://problem/34095873: AF2 ordering rules allow ARM device ordering violations + mrs x12, ARM64_REG_EHID4 + orr x12, x12, ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd + msr ARM64_REG_EHID4, x12 + + // rdar://problem/36595004: Poisoned younger load is not redirected by older load-acquire + mrs x12, ARM64_REG_EHID3 + orr x12, x12, ARM64_REG_EHID3_DisColorOpt + msr ARM64_REG_EHID3, x12 + + // rdar://problem/37949166: Disable the extension of prefetcher training pipe clock gating, revert to default gating + mrs x12, ARM64_REG_EHID10 + orr x12, x12, ARM64_REG_EHID10_rccDisPwrSavePrfClkOff + msr ARM64_REG_EHID10, x12 + +Lskip_cyprus_ecore_only: + +#endif /* defined (APPLEVORTEX) */ + +#if defined(ARM64_BOARD_CONFIG_T8030) + // Cebu = cpu_base) && (prevfp < cpu_top)) != + ((fp >= cpu_base) && (fp < cpu_top))) { + switched_stacks = TRUE; + break; + } +#endif } if (!switched_stacks) { diff --git a/osfmk/kern/clock.c b/osfmk/kern/clock.c index 6801e0f31..e885cee9b 100644 --- a/osfmk/kern/clock.c +++ b/osfmk/kern/clock.c @@ -87,7 +87,9 @@ #include uint32_t hz_tick_interval = 1; +#if !HAS_CONTINUOUS_HWCLOCK static uint64_t has_monotonic_clock = 0; +#endif decl_simple_lock_data(, 
clock_lock); lck_grp_attr_t * settime_lock_grp_attr; @@ -234,6 +236,7 @@ bintime2nsclock(const struct bintime *_bt, clock_sec_t *secs, clock_usec_t *nano *nanosecs = ((uint64_t)NSEC_PER_SEC * (uint32_t)(_bt->frac >> 32)) >> 32; } +#if !defined(HAS_CONTINUOUS_HWCLOCK) static __inline void bintime2absolutetime(const struct bintime *_bt, uint64_t *abs) { @@ -250,6 +253,7 @@ struct latched_time { extern int kernel_sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, size_t newlen); +#endif /* * Time of day (calendar) variables. * @@ -270,7 +274,9 @@ static struct clock_calend { struct bintime offset; /* cumulative offset expressed in (sec, 64 bits frac of a second) */ struct bintime bintime; /* cumulative offset (it includes bootime) expressed in (sec, 64 bits frac of a second) */ struct bintime boottime; /* boot time expressed in (sec, 64 bits frac of a second) */ +#if !HAS_CONTINUOUS_HWCLOCK struct bintime basesleep; +#endif } clock_calend; static uint64_t ticks_per_sec; /* ticks in a second (expressed in abs time) */ @@ -957,6 +963,7 @@ print_all_clock_variables_internal(const char* func, struct clock_calend* clock_ func, clock_calend_cp->boottime.sec, clock_calend_cp->boottime.frac, (unsigned long)bootime_secs, bootime_microsecs); +#if !HAS_CONTINUOUS_HWCLOCK clock_sec_t basesleep_secs; clock_usec_t basesleep_microsecs; @@ -964,6 +971,7 @@ print_all_clock_variables_internal(const char* func, struct clock_calend* clock_ os_log(OS_LOG_DEFAULT, "%s basesleep.sec %ld basesleep.frac %llu basesleep_secs %lu basesleep_microsecs %d\n", func, clock_calend_cp->basesleep.sec, clock_calend_cp->basesleep.frac, (unsigned long)basesleep_secs, basesleep_microsecs); +#endif } @@ -1023,6 +1031,7 @@ clock_initialize_calendar(void) clock_usec_t utc_offset_microsecs; spl_t s; struct bintime bt; +#if !HAS_CONTINUOUS_HWCLOCK struct bintime monotonic_bt; struct latched_time monotonic_time; uint64_t monotonic_usec_total; @@ -1030,10 +1039,12 @@ 
clock_initialize_calendar(void) clock_usec_t microsys2, monotonic_usec; size_t size; +#endif //Get the UTC time and corresponding sys time PEGetUTCTimeOfDay(&secs, &microsecs); clock_get_system_microtime(&sys, &microsys); +#if !HAS_CONTINUOUS_HWCLOCK /* * If the platform has a monotonic clock, use kern.monotonicclock_usecs * to estimate the sleep/wake time, otherwise use the UTC time to estimate @@ -1049,6 +1060,7 @@ clock_initialize_calendar(void) absolutetime_to_microtime(monotonic_time.mach_time, &sys2, &microsys2); os_log(OS_LOG_DEFAULT, "%s system has monotonic clock\n", __func__); } +#endif s = splclock(); clock_lock(); @@ -1099,6 +1111,7 @@ clock_initialize_calendar(void) clock_calend.s_scale_ns = NSEC_PER_SEC; clock_calend.s_adj_nsx = 0; +#if !HAS_CONTINUOUS_HWCLOCK if (has_monotonic_clock) { monotonic_sec = monotonic_usec_total / (clock_sec_t)USEC_PER_SEC; monotonic_usec = monotonic_usec_total % (clock_usec_t)USEC_PER_SEC; @@ -1111,6 +1124,7 @@ clock_initialize_calendar(void) // set the baseleep as the difference between monotonic clock - sys clock_calend.basesleep = monotonic_bt; } +#endif commpage_update_mach_continuous_time(mach_absolutetime_asleep); #if DEVELOPMENT || DEBUG @@ -1132,6 +1146,73 @@ clock_initialize_calendar(void) #endif } +#if HAS_CONTINUOUS_HWCLOCK + +static void +scale_sleep_time(void) +{ + /* Apply the current NTP frequency adjustment to the time slept. + * The frequency adjustment remains stable between calls to ntp_adjtime(), + * and should thus provide a reasonable approximation of the total adjustment + * required for the time slept.
*/ + struct bintime sleep_time; + uint64_t tick_scale_x, s_scale_ns; + int64_t s_adj_nsx; + int64_t sleep_adj = ntp_get_freq(); + if (sleep_adj) { + get_scale_factors_from_adj(sleep_adj, &tick_scale_x, &s_scale_ns, &s_adj_nsx); + sleep_time = scale_delta(mach_absolutetime_last_sleep, tick_scale_x, s_scale_ns, s_adj_nsx); + } else { + tick_scale_x = (uint64_t)1 << 63; + tick_scale_x /= ticks_per_sec; + tick_scale_x *= 2; + sleep_time.sec = mach_absolutetime_last_sleep / ticks_per_sec; + sleep_time.frac = (mach_absolutetime_last_sleep % ticks_per_sec) * tick_scale_x; + } + bintime_add(&clock_calend.offset, &sleep_time); + bintime_add(&clock_calend.bintime, &sleep_time); +} + +void +clock_wakeup_calendar(void) +{ + spl_t s; + + s = splclock(); + clock_lock(); + + commpage_disable_timestamp(); + + uint64_t abstime = mach_absolute_time(); + uint64_t total_sleep_time = ml_get_hwclock() - abstime; + + mach_absolutetime_last_sleep = total_sleep_time - mach_absolutetime_asleep; + mach_absolutetime_asleep = total_sleep_time; + + scale_sleep_time(); + + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_CLOCK, MACH_EPOCH_CHANGE) | DBG_FUNC_NONE, + (uintptr_t) mach_absolutetime_last_sleep, + (uintptr_t) mach_absolutetime_asleep, + (uintptr_t) (mach_absolutetime_last_sleep >> 32), + (uintptr_t) (mach_absolutetime_asleep >> 32), + 0); + + commpage_update_mach_continuous_time(mach_absolutetime_asleep); + adjust_cont_time_thread_calls(); + + clock_unlock(); + splx(s); + + host_notify_calendar_change(); + +#if CONFIG_DTRACE + clock_track_calend_nowait(); +#endif +} + +#else /* HAS_CONTINUOUS_HWCLOCK */ void clock_wakeup_calendar(void) @@ -1348,6 +1429,7 @@ clock_wakeup_calendar(void) #endif } +#endif /* !HAS_CONTINUOUS_HWCLOCK */ /* * clock_get_boottime_nanotime: @@ -1586,6 +1668,9 @@ clock_deadline_for_periodic_event( uint64_t mach_continuous_time(void) { +#if HAS_CONTINUOUS_HWCLOCK + return ml_get_hwclock(); +#else while (1) { uint64_t read1 = mach_absolutetime_asleep; uint64_t 
absolute = mach_absolute_time(); @@ -1596,11 +1681,15 @@ mach_continuous_time(void) return absolute + read1; } } +#endif } uint64_t mach_continuous_approximate_time(void) { +#if HAS_CONTINUOUS_HWCLOCK + return ml_get_hwclock(); +#else while (1) { uint64_t read1 = mach_absolutetime_asleep; uint64_t absolute = mach_approximate_time(); @@ -1611,6 +1700,7 @@ mach_continuous_approximate_time(void) return absolute + read1; } } +#endif } /* diff --git a/osfmk/kern/host.c b/osfmk/kern/host.c index a4a617d32..20c95b23e 100644 --- a/osfmk/kern/host.c +++ b/osfmk/kern/host.c @@ -1413,13 +1413,15 @@ host_security_self(void) } kern_return_t -host_set_atm_diagnostic_flag(host_priv_t host_priv, uint32_t diagnostic_flag) +host_set_atm_diagnostic_flag(host_t host, uint32_t diagnostic_flag) { - if (host_priv == HOST_PRIV_NULL) { + if (host == HOST_NULL) { return KERN_INVALID_ARGUMENT; } - assert(host_priv == &realhost); + if (!IOTaskHasEntitlement(current_task(), "com.apple.private.set-atm-diagnostic-flag")) { + return KERN_NO_ACCESS; + } #if CONFIG_ATM return atm_set_diagnostic_config(diagnostic_flag); diff --git a/osfmk/kern/ipc_kobject.c b/osfmk/kern/ipc_kobject.c index d2e0c1746..d29c63124 100644 --- a/osfmk/kern/ipc_kobject.c +++ b/osfmk/kern/ipc_kobject.c @@ -627,8 +627,20 @@ ipc_kobject_alloc_port( ipc_kobject_type_t type, ipc_kobject_alloc_options_t options) { - ipc_port_t port = ipc_port_alloc_kernel(); + ipc_port_init_flags_t flags; + ipc_space_t space; + ipc_port_t port; + if (options & IPC_KOBJECT_ALLOC_IN_TRANSIT) { + /* kobject port intended to be copied out to user-space */ + flags = IPC_PORT_INIT_MESSAGE_QUEUE; + space = IS_NULL; + } else { + /* true kernel-bound kobject port */ + flags = IPC_PORT_INIT_NONE; + space = ipc_space_kernel; + } + port = ipc_port_alloc_special(space, flags); if (port == IP_NULL) { panic("ipc_kobject_alloc_port(): failed to allocate port"); } @@ -638,16 +650,29 @@ ipc_kobject_alloc_port( if (options & IPC_KOBJECT_ALLOC_MAKE_SEND) { 
ipc_port_make_send_locked(port); } - if (options & IPC_KOBJECT_ALLOC_NSREQUEST) { - ipc_port_make_sonce_locked(port); - port->ip_nsrequest = port; - } - if (options & IPC_KOBJECT_ALLOC_NO_GRANT) { - port->ip_no_grant = 1; + + if (options & IPC_KOBJECT_ALLOC_IN_TRANSIT) { + /* reset the port as if it had been copied in and circularity-checked */ + if (options & IPC_KOBJECT_ALLOC_NSREQUEST) { + panic("ipc_kobject_alloc_port(): invalid option for user-space port"); + } + port->ip_mscount = 0; + assert(port->ip_tempowner == 0); + assert(port->ip_receiver == IS_NULL); + port->ip_receiver = IS_NULL; + port->ip_receiver_name = MACH_PORT_NULL; + } else { + if (options & IPC_KOBJECT_ALLOC_NSREQUEST) { + ipc_port_make_sonce_locked(port); + port->ip_nsrequest = port; + } } if (options & IPC_KOBJECT_ALLOC_IMMOVABLE_SEND) { port->ip_immovable_send = 1; } + if (options & IPC_KOBJECT_ALLOC_NO_GRANT) { + port->ip_no_grant = 1; + } return port; } diff --git a/osfmk/kern/ipc_kobject.h b/osfmk/kern/ipc_kobject.h index 4431f29ca..24913d602 100644 --- a/osfmk/kern/ipc_kobject.h +++ b/osfmk/kern/ipc_kobject.h @@ -174,6 +174,8 @@ __options_decl(ipc_kobject_alloc_options_t, uint32_t, { IPC_KOBJECT_ALLOC_NO_GRANT = 0x00000004, /* Make all the send rights immovable */ IPC_KOBJECT_ALLOC_IMMOVABLE_SEND = 0x00000008, + /* Make the port in-transit from the get-go */ + IPC_KOBJECT_ALLOC_IN_TRANSIT = 0x00000010, }); /* Allocates a kobject port, never fails */ diff --git a/osfmk/kern/kern_stackshot.c b/osfmk/kern/kern_stackshot.c index e571487fa..7ff3981a7 100644 --- a/osfmk/kern/kern_stackshot.c +++ b/osfmk/kern/kern_stackshot.c @@ -664,7 +664,7 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi stackshotbuf_size = get_stackshot_estsize(size_hint); for (; stackshotbuf_size <= max_tracebuf_size; stackshotbuf_size <<= 1) { - if (kmem_alloc(kernel_map, (vm_offset_t *)&stackshotbuf, stackshotbuf_size, VM_KERN_MEMORY_DIAG) != KERN_SUCCESS) { + if
(kmem_alloc_flags(kernel_map, (vm_offset_t *)&stackshotbuf, stackshotbuf_size, VM_KERN_MEMORY_DIAG, KMA_ZERO) != KERN_SUCCESS) { error = KERN_RESOURCE_SHORTAGE; goto error_exit; } diff --git a/osfmk/kern/mk_timer.c b/osfmk/kern/mk_timer.c index 8de9c9012..e7780c2e3 100644 --- a/osfmk/kern/mk_timer.c +++ b/osfmk/kern/mk_timer.c @@ -73,29 +73,40 @@ mk_timer_create_trap( return MACH_PORT_NULL; } - result = mach_port_allocate_internal(myspace, MACH_PORT_RIGHT_RECEIVE, - &mk_timer_qos, &name); - if (result == KERN_SUCCESS) { - result = ipc_port_translate_receive(myspace, name, &port); - } - - if (result != KERN_SUCCESS) { + /* Pre-allocate a kmsg for the timer messages */ + ipc_kmsg_t kmsg; + kmsg = ipc_kmsg_prealloc(mk_timer_qos.len + MAX_TRAILER_SIZE); + if (kmsg == IKM_NULL) { zfree(mk_timer_zone, timer); - return MACH_PORT_NULL; } + /* Allocate an in-transit kobject port with a send right */ + ipc_kobject_alloc_options_t options; + options = (IPC_KOBJECT_ALLOC_IN_TRANSIT | IPC_KOBJECT_ALLOC_MAKE_SEND); + port = ipc_kobject_alloc_port((ipc_kobject_t)timer, IKOT_TIMER, options); + assert(port != IP_NULL); + + /* Associate the kmsg */ + ipc_kmsg_set_prealloc(kmsg, port); + + /* Initialize the timer object and bind port to it */ simple_lock_init(&timer->lock, 0); thread_call_setup(&timer->call_entry, mk_timer_expire, timer); timer->is_armed = timer->is_dead = FALSE; timer->active = 0; - timer->port = port; - ipc_kobject_set_atomically(port, (ipc_kobject_t)timer, IKOT_TIMER); - port->ip_srights++; - ip_reference(port); - ip_unlock(port); + /* Copyout the receive right for the timer port to user-space */ + current_thread()->ith_knote = ITH_KNOTE_NULL; + result = ipc_object_copyout(myspace, ip_to_object(port), + MACH_MSG_TYPE_MOVE_RECEIVE, + NULL, NULL, &name); + if (result != KERN_SUCCESS) { + ipc_object_destroy(ip_to_object(port), MACH_MSG_TYPE_MOVE_RECEIVE); + /* should trigger mk_timer_port_destroy() call */ + return MACH_PORT_NULL; + } return name; } diff --git 
a/osfmk/kern/processor.h b/osfmk/kern/processor.h index 06e54544c..faac9b224 100644 --- a/osfmk/kern/processor.h +++ b/osfmk/kern/processor.h @@ -147,6 +147,10 @@ typedef enum { typedef enum { PSET_SMP, +#if __AMP__ + PSET_AMP_E, + PSET_AMP_P, +#endif } pset_cluster_type_t; typedef bitmap_t cpumap_t; diff --git a/osfmk/kern/sched_amp.c b/osfmk/kern/sched_amp.c new file mode 100644 index 000000000..50c381008 --- /dev/null +++ b/osfmk/kern/sched_amp.c @@ -0,0 +1,768 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if __AMP__ + +static thread_t +sched_amp_steal_thread(processor_set_t pset); + +static void +sched_amp_thread_update_scan(sched_update_scan_context_t scan_context); + +static boolean_t +sched_amp_processor_enqueue(processor_t processor, thread_t thread, + sched_options_t options); + +static boolean_t +sched_amp_processor_queue_remove(processor_t processor, thread_t thread); + +static ast_t +sched_amp_processor_csw_check(processor_t processor); + +static boolean_t +sched_amp_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte); + +static int +sched_amp_runq_count(processor_t processor); + +static boolean_t +sched_amp_processor_queue_empty(processor_t processor); + +static uint64_t +sched_amp_runq_stats_count_sum(processor_t processor); + +static int +sched_amp_processor_bound_count(processor_t processor); + +static void +sched_amp_pset_init(processor_set_t pset); + +static void +sched_amp_processor_init(processor_t processor); + +static thread_t +sched_amp_choose_thread(processor_t processor, int priority, ast_t reason); + +static void +sched_amp_processor_queue_shutdown(processor_t processor); + +static sched_mode_t +sched_amp_initial_thread_sched_mode(task_t parent_task); + +static processor_t +sched_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread); + +static bool +sched_amp_thread_avoid_processor(processor_t processor, thread_t thread); + +static bool +sched_amp_thread_should_yield(processor_t processor, thread_t thread); + +static void +sched_amp_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation); + +const struct sched_dispatch_table sched_amp_dispatch = { + .sched_name = "amp", + .init = sched_amp_init, + 
.timebase_init = sched_timeshare_timebase_init, + .processor_init = sched_amp_processor_init, + .pset_init = sched_amp_pset_init, + .maintenance_continuation = sched_timeshare_maintenance_continue, + .choose_thread = sched_amp_choose_thread, + .steal_thread_enabled = sched_amp_steal_thread_enabled, + .steal_thread = sched_amp_steal_thread, + .compute_timeshare_priority = sched_compute_timeshare_priority, + .choose_processor = sched_amp_choose_processor, + .processor_enqueue = sched_amp_processor_enqueue, + .processor_queue_shutdown = sched_amp_processor_queue_shutdown, + .processor_queue_remove = sched_amp_processor_queue_remove, + .processor_queue_empty = sched_amp_processor_queue_empty, + .priority_is_urgent = priority_is_urgent, + .processor_csw_check = sched_amp_processor_csw_check, + .processor_queue_has_priority = sched_amp_processor_queue_has_priority, + .initial_quantum_size = sched_timeshare_initial_quantum_size, + .initial_thread_sched_mode = sched_amp_initial_thread_sched_mode, + .can_update_priority = can_update_priority, + .update_priority = update_priority, + .lightweight_update_priority = lightweight_update_priority, + .quantum_expire = sched_default_quantum_expire, + .processor_runq_count = sched_amp_runq_count, + .processor_runq_stats_count_sum = sched_amp_runq_stats_count_sum, + .processor_bound_count = sched_amp_processor_bound_count, + .thread_update_scan = sched_amp_thread_update_scan, + .multiple_psets_enabled = TRUE, + .sched_groups_enabled = FALSE, + .avoid_processor_enabled = TRUE, + .thread_avoid_processor = sched_amp_thread_avoid_processor, + .processor_balance = sched_amp_balance, + + .rt_runq = sched_amp_rt_runq, + .rt_init = sched_amp_rt_init, + .rt_queue_shutdown = sched_amp_rt_queue_shutdown, + .rt_runq_scan = sched_amp_rt_runq_scan, + .rt_runq_count_sum = sched_amp_rt_runq_count_sum, + + .qos_max_parallelism = sched_amp_qos_max_parallelism, + .check_spill = sched_amp_check_spill, + .ipi_policy = sched_amp_ipi_policy, + 
.thread_should_yield = sched_amp_thread_should_yield, + .run_count_incr = sched_run_incr, + .run_count_decr = sched_run_decr, + .update_thread_bucket = sched_update_thread_bucket, + .pset_made_schedulable = sched_pset_made_schedulable, + .thread_group_recommendation_change = sched_amp_thread_group_recommendation_change, +}; + +extern processor_set_t ecore_set; +extern processor_set_t pcore_set; + +__attribute__((always_inline)) +static inline run_queue_t +amp_main_runq(processor_t processor) +{ + return &processor->processor_set->pset_runq; +} + +__attribute__((always_inline)) +static inline run_queue_t +amp_bound_runq(processor_t processor) +{ + return &processor->runq; +} + +__attribute__((always_inline)) +static inline run_queue_t +amp_runq_for_thread(processor_t processor, thread_t thread) +{ + if (thread->bound_processor == PROCESSOR_NULL) { + return amp_main_runq(processor); + } else { + assert(thread->bound_processor == processor); + return amp_bound_runq(processor); + } +} + +static sched_mode_t +sched_amp_initial_thread_sched_mode(task_t parent_task) +{ + if (parent_task == kernel_task) { + return TH_MODE_FIXED; + } else { + return TH_MODE_TIMESHARE; + } +} + +static void +sched_amp_processor_init(processor_t processor) +{ + run_queue_init(&processor->runq); +} + +static void +sched_amp_pset_init(processor_set_t pset) +{ + run_queue_init(&pset->pset_runq); +} + +static thread_t +sched_amp_choose_thread( + processor_t processor, + int priority, + __unused ast_t reason) +{ + processor_set_t pset = processor->processor_set; + bool spill_pending = false; + int spill_pri = -1; + + if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) { + spill_pending = true; + spill_pri = pcore_set->pset_runq.highq; + } + + run_queue_t main_runq = amp_main_runq(processor); + run_queue_t bound_runq = amp_bound_runq(processor); + run_queue_t chosen_runq; + + if ((bound_runq->highq < priority) && + (main_runq->highq < priority) && + (spill_pri < 
priority)) { + return THREAD_NULL; + } + + if ((spill_pri > bound_runq->highq) && + (spill_pri > main_runq->highq)) { + /* + * There is a higher priority thread on the P-core runq, + * so returning THREAD_NULL here will cause thread_select() + * to call sched_amp_steal_thread() to try to get it. + */ + return THREAD_NULL; + } + + if (bound_runq->highq >= main_runq->highq) { + chosen_runq = bound_runq; + } else { + chosen_runq = main_runq; + } + + return run_queue_dequeue(chosen_runq, SCHED_HEADQ); +} + +static boolean_t +sched_amp_processor_enqueue( + processor_t processor, + thread_t thread, + sched_options_t options) +{ + run_queue_t rq = amp_runq_for_thread(processor, thread); + boolean_t result; + + result = run_queue_enqueue(rq, thread, options); + thread->runq = processor; + + return result; +} + +static boolean_t +sched_amp_processor_queue_empty(processor_t processor) +{ + processor_set_t pset = processor->processor_set; + bool spill_pending = bit_test(pset->pending_spill_cpu_mask, processor->cpu_id); + + return (amp_main_runq(processor)->count == 0) && + (amp_bound_runq(processor)->count == 0) && + !spill_pending; +} + +static bool +sched_amp_thread_should_yield(processor_t processor, thread_t thread) +{ + if (!sched_amp_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) { + return true; + } + + if ((processor->processor_set->pset_cluster_type == PSET_AMP_E) && (recommended_pset_type(thread) == PSET_AMP_P)) { + return pcore_set->pset_runq.count > 0; + } + + return false; +} + +static ast_t +sched_amp_processor_csw_check(processor_t processor) +{ + boolean_t has_higher; + int pri; + + run_queue_t main_runq = amp_main_runq(processor); + run_queue_t bound_runq = amp_bound_runq(processor); + + assert(processor->active_thread != NULL); + + processor_set_t pset = processor->processor_set; + bool spill_pending = false; + int spill_pri = -1; + int spill_urgency = 0; + + if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, 
processor->cpu_id)) { + spill_pending = true; + spill_pri = pcore_set->pset_runq.highq; + spill_urgency = pcore_set->pset_runq.urgency; + } + + pri = MAX(main_runq->highq, bound_runq->highq); + if (spill_pending) { + pri = MAX(pri, spill_pri); + } + + if (processor->first_timeslice) { + has_higher = (pri > processor->current_pri); + } else { + has_higher = (pri >= processor->current_pri); + } + + if (has_higher) { + if (main_runq->urgency > 0) { + return AST_PREEMPT | AST_URGENT; + } + + if (bound_runq->urgency > 0) { + return AST_PREEMPT | AST_URGENT; + } + + if (spill_urgency > 0) { + return AST_PREEMPT | AST_URGENT; + } + + return AST_PREEMPT; + } + + return AST_NONE; +} + +static boolean_t +sched_amp_processor_queue_has_priority(processor_t processor, + int priority, + boolean_t gte) +{ + bool spill_pending = false; + int spill_pri = -1; + processor_set_t pset = processor->processor_set; + + if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) { + spill_pending = true; + spill_pri = pcore_set->pset_runq.highq; + } + run_queue_t main_runq = amp_main_runq(processor); + run_queue_t bound_runq = amp_bound_runq(processor); + + int qpri = MAX(main_runq->highq, bound_runq->highq); + if (spill_pending) { + qpri = MAX(qpri, spill_pri); + } + + if (gte) { + return qpri >= priority; + } else { + return qpri > priority; + } +} + +static int +sched_amp_runq_count(processor_t processor) +{ + return amp_main_runq(processor)->count + amp_bound_runq(processor)->count; +} + +static uint64_t +sched_amp_runq_stats_count_sum(processor_t processor) +{ + uint64_t bound_sum = amp_bound_runq(processor)->runq_stats.count_sum; + + if (processor->cpu_id == processor->processor_set->cpu_set_low) { + return bound_sum + amp_main_runq(processor)->runq_stats.count_sum; + } else { + return bound_sum; + } +} +static int +sched_amp_processor_bound_count(processor_t processor) +{ + return amp_bound_runq(processor)->count; +} + +static void 
+sched_amp_processor_queue_shutdown(processor_t processor) +{ + processor_set_t pset = processor->processor_set; + run_queue_t rq = amp_main_runq(processor); + thread_t thread; + queue_head_t tqueue; + + /* We only need to migrate threads if this is the last active or last recommended processor in the pset */ + if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) { + pset_unlock(pset); + return; + } + + queue_init(&tqueue); + + while (rq->count > 0) { + thread = run_queue_dequeue(rq, SCHED_HEADQ); + enqueue_tail(&tqueue, &thread->runq_links); + } + + pset_unlock(pset); + + qe_foreach_element_safe(thread, &tqueue, runq_links) { + remqueue(&thread->runq_links); + + thread_lock(thread); + + thread_setrun(thread, SCHED_TAILQ); + + thread_unlock(thread); + } +} + +static boolean_t +sched_amp_processor_queue_remove( + processor_t processor, + thread_t thread) +{ + run_queue_t rq; + processor_set_t pset = processor->processor_set; + + pset_lock(pset); + + rq = amp_runq_for_thread(processor, thread); + + if (processor == thread->runq) { + /* + * Thread is on a run queue and we have a lock on + * that run queue. + */ + run_queue_remove(rq, thread); + } else { + /* + * The thread left the run queue before we could + * lock the run queue. 
+ */ + assert(thread->runq == PROCESSOR_NULL); + processor = PROCESSOR_NULL; + } + + pset_unlock(pset); + + return processor != PROCESSOR_NULL; +} + +/* + * sched_amp_steal_thread() + * Clears the current processor pending-spill bit in pset and, when pcore_set load is at or above the steal threshold, may dequeue the head of the pcore_set run queue; otherwise returns THREAD_NULL. pset is unlocked here without first being locked, so the caller must pass it in locked; no pset lock is held on return. + */ +thread_t +sched_amp_steal_thread(processor_set_t pset) +{ + thread_t thread = THREAD_NULL; + processor_set_t nset = pset; + + assert(pset->pset_cluster_type != PSET_AMP_P); + + processor_t processor = current_processor(); + assert(pset == processor->processor_set); + + bool spill_pending = bit_test(pset->pending_spill_cpu_mask, processor->cpu_id); + bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id); + + nset = pcore_set; + + assert(nset != pset); + + if (sched_get_pset_load_average(nset) >= sched_amp_steal_threshold(nset, spill_pending)) { + pset_unlock(pset); + + pset = nset; + + pset_lock(pset); + + /* Allow steal if load average still OK, no idle cores, and more threads on runq than active cores DISPATCHING */ + if ((sched_get_pset_load_average(pset) >= sched_amp_steal_threshold(pset, spill_pending)) && + (pset->pset_runq.count > bit_count(pset->cpu_state_map[PROCESSOR_DISPATCHING])) && + (bit_count(pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) == 0)) { + thread = run_queue_dequeue(&pset->pset_runq, SCHED_HEADQ); + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_STEAL) | DBG_FUNC_NONE, spill_pending, 0, 0, 0); + sched_update_pset_load_average(pset); + } + } + + pset_unlock(pset); + return thread; +} + + + +static void +sched_amp_thread_update_scan(sched_update_scan_context_t scan_context) +{ + boolean_t restart_needed = FALSE; + processor_t processor = processor_list; + processor_set_t pset; + thread_t thread; + spl_t s; + + /* + * We update the threads associated with each processor (bound and idle threads) + * and then update the threads in each pset runqueue.
+ */ + + do { + do { + pset = processor->processor_set; + + s = splsched(); + pset_lock(pset); + + restart_needed = runq_scan(amp_bound_runq(processor), scan_context); + + pset_unlock(pset); + splx(s); + + if (restart_needed) { + break; + } + + thread = processor->idle_thread; + if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) { + if (thread_update_add_thread(thread) == FALSE) { + restart_needed = TRUE; + break; + } + } + } while ((processor = processor->processor_list) != NULL); + + /* Ok, we now have a collection of candidates -- fix them. */ + thread_update_process_threads(); + } while (restart_needed); + + pset_node_t node = &pset_node0; + pset = node->psets; + + do { + do { + restart_needed = FALSE; + while (pset != NULL) { + s = splsched(); + pset_lock(pset); + + restart_needed = runq_scan(&pset->pset_runq, scan_context); + + pset_unlock(pset); + splx(s); + + if (restart_needed) { + break; + } + + pset = pset->pset_list; + } + + if (restart_needed) { + break; + } + } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL)); + + /* Ok, we now have a collection of candidates -- fix them. 
*/ + thread_update_process_threads(); + } while (restart_needed); +} + +static bool +pcores_recommended(thread_t thread) +{ + if (pcore_set->online_processor_count == 0) { + /* No pcores available */ + return false; + } + + if (!pset_is_recommended(ecore_set)) { + /* No E cores recommended, must use P cores */ + return true; + } + + if (recommended_pset_type(thread) == PSET_AMP_E) { + return false; + } + + return pset_is_recommended(pcore_set); +} + +/* Return true if this thread should not continue running on this processor */ +static bool +sched_amp_thread_avoid_processor(processor_t processor, thread_t thread) +{ + if (processor->processor_set->pset_cluster_type == PSET_AMP_E) { + if (pcores_recommended(thread)) { + return true; + } + } else if (processor->processor_set->pset_cluster_type == PSET_AMP_P) { + if (!pcores_recommended(thread)) { + return true; + } + } + + return false; +} + +static processor_t +sched_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread) +{ + /* Bound threads don't call this function */ + assert(thread->bound_processor == PROCESSOR_NULL); + + processor_set_t nset = pset; + bool choose_pcores; + +again: + choose_pcores = pcores_recommended(thread); + + if (choose_pcores && (pset->pset_cluster_type != PSET_AMP_P)) { + nset = pcore_set; + assert(nset != NULL); + } else if (!choose_pcores && (pset->pset_cluster_type != PSET_AMP_E)) { + nset = ecore_set; + assert(nset != NULL); + } + + if (nset != pset) { + pset_unlock(pset); + pset_lock(nset); + } + + /* Now that the chosen pset is definitely locked, make sure nothing important has changed */ + if (!pset_is_recommended(nset)) { + pset = nset; + goto again; + } + + return choose_processor(nset, processor, thread); +} + +void +sched_amp_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation) +{ + thread_group_update_recommendation(tg, new_recommendation); + + if (new_recommendation != CLUSTER_TYPE_P) { + return; + } + + 
sched_amp_bounce_thread_group_from_ecores(ecore_set, tg); +} + +#if DEVELOPMENT || DEBUG +extern int32_t sysctl_get_bound_cpuid(void); +int32_t +sysctl_get_bound_cpuid(void) +{ + int32_t cpuid = -1; + thread_t self = current_thread(); + + processor_t processor = self->bound_processor; + if (processor == NULL) { + cpuid = -1; + } else { + cpuid = processor->cpu_id; + } + + return cpuid; +} + +extern void sysctl_thread_bind_cpuid(int32_t cpuid); +void +sysctl_thread_bind_cpuid(int32_t cpuid) +{ + if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) { + return; + } + + processor_t processor = processor_array[cpuid]; + if (processor == PROCESSOR_NULL) { + return; + } + + thread_bind(processor); + + thread_block(THREAD_CONTINUE_NULL); +} + +extern char sysctl_get_bound_cluster_type(void); +char +sysctl_get_bound_cluster_type(void) +{ + thread_t self = current_thread(); + + if (self->sched_flags & TH_SFLAG_ECORE_ONLY) { + return 'E'; + } else if (self->sched_flags & TH_SFLAG_PCORE_ONLY) { + return 'P'; + } + + return '0'; +} + +extern void sysctl_thread_bind_cluster_type(char cluster_type); +void +sysctl_thread_bind_cluster_type(char cluster_type) +{ + thread_bind_cluster_type(cluster_type); +} + +extern char sysctl_get_task_cluster_type(void); +char +sysctl_get_task_cluster_type(void) +{ + thread_t thread = current_thread(); + task_t task = thread->task; + + if (task->pset_hint == ecore_set) { + return 'E'; + } else if (task->pset_hint == pcore_set) { + return 'P'; + } + + return '0'; +} + +extern void sysctl_task_set_cluster_type(char cluster_type); +void +sysctl_task_set_cluster_type(char cluster_type) +{ + thread_t thread = current_thread(); + task_t task = thread->task; + + switch (cluster_type) { + case 'e': + case 'E': + task->pset_hint = ecore_set; + break; + case 'p': + case 'P': + task->pset_hint = pcore_set; + break; + default: + break; + } + + thread_block(THREAD_CONTINUE_NULL); +} +#endif + +#endif diff --git a/osfmk/kern/sched_amp_common.c 
b/osfmk/kern/sched_amp_common.c new file mode 100644 index 000000000..1158090d9 --- /dev/null +++ b/osfmk/kern/sched_amp_common.c @@ -0,0 +1,592 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if __AMP__ + +/* Exported globals */ +processor_set_t ecore_set = NULL; +processor_set_t pcore_set = NULL; + +static struct processor_set pset1; +static struct pset_node pset_node1; + +#if DEVELOPMENT || DEBUG +bool system_ecore_only = false; +#endif /* DEVELOPMENT || DEBUG */ + +/* + * sched_amp_init() + * + * Initialize the pcore_set and ecore_set globals which describe the + * P/E processor sets. + */ +void +sched_amp_init(void) +{ + pset_init(&pset1, &pset_node1); + pset_node1.psets = &pset1; + pset_node0.node_list = &pset_node1; + + if (ml_get_boot_cluster() == CLUSTER_TYPE_P) { + pcore_set = &pset0; + ecore_set = &pset1; + } else { + ecore_set = &pset0; + pcore_set = &pset1; + } + + ecore_set->pset_cluster_type = PSET_AMP_E; + ecore_set->pset_cluster_id = 0; + + pcore_set->pset_cluster_type = PSET_AMP_P; + pcore_set->pset_cluster_id = 1; + +#if !CONFIG_SCHED_CLUTCH + /* + * For non-clutch scheduler, allow system to be e-core only. + * Clutch scheduler support for this feature needs to be implemented. + */ +#if DEVELOPMENT || DEBUG + if (PE_parse_boot_argn("enable_skstsct", NULL, 0)) { + system_ecore_only = true; + } +#endif /* DEVELOPMENT || DEBUG */ + +#endif /* !CONFIG_SCHED_CLUTCH */ + sched_timeshare_init(); +} + +/* Spill threshold load average is ncpus in pset + (sched_amp_spill_count/(1 << PSET_LOAD_FRACTIONAL_SHIFT) */ +int sched_amp_spill_count = 3; +int sched_amp_idle_steal = 1; +int sched_amp_spill_steal = 1; + +/* + * We see performance gains from doing immediate IPIs to P-cores to run + * P-eligible threads and lesser P-E migrations from using deferred IPIs + * for spill. 
+ */ +int sched_amp_spill_deferred_ipi = 1; +int sched_amp_pcores_preempt_immediate_ipi = 1; + + +/* + * sched_amp_spill_threshold() + * + * Routine to calculate spill threshold which decides if cluster should spill. + */ +int +sched_amp_spill_threshold(processor_set_t pset) +{ + int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask); + + return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + sched_amp_spill_count; +} + +/* + * pset_signal_spill() + * + * Routine to signal a running/idle CPU to cause a spill onto that CPU. + * Called with pset locked, returns unlocked + */ +void +pset_signal_spill(processor_set_t pset, int spilled_thread_priority) +{ + processor_t processor; + sched_ipi_type_t ipi_type = SCHED_IPI_NONE; + + uint64_t idle_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]; + for (int cpuid = lsb_first(idle_map); cpuid >= 0; cpuid = lsb_next(idle_map, cpuid)) { + processor = processor_array[cpuid]; + if (bit_set_if_clear(pset->pending_spill_cpu_mask, processor->cpu_id)) { + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 0, 0, 0); + + processor->deadline = UINT64_MAX; + pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); + + if (processor == current_processor()) { + bit_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id); + } else { + ipi_type = sched_ipi_action(processor, NULL, true, SCHED_IPI_EVENT_SPILL); + } + pset_unlock(pset); + sched_ipi_perform(processor, ipi_type); + return; + } + } + + processor_t ast_processor = NULL; + uint64_t running_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING]; + for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) { + processor = processor_array[cpuid]; + if (processor->current_recommended_pset_type == PSET_AMP_P) { + /* Already running a spilled P-core recommended thread */ + continue; + } + if 
(bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) { + /* Already received a spill signal */ + continue; + } + if (processor->current_pri >= spilled_thread_priority) { + /* Already running a higher or equal priority thread */ + continue; + } + + /* Found a suitable processor */ + bit_set(pset->pending_spill_cpu_mask, processor->cpu_id); + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 1, 0, 0); + if (processor == current_processor()) { + ast_on(AST_PREEMPT); + } + ipi_type = sched_ipi_action(processor, NULL, false, SCHED_IPI_EVENT_SPILL); + if (ipi_type != SCHED_IPI_NONE) { + ast_processor = processor; + } + break; + } + + pset_unlock(pset); + sched_ipi_perform(ast_processor, ipi_type); +} + +/* + * pset_should_accept_spilled_thread() + * + * Routine to decide if pset should accept spilled threads. + * This function must be safe to call (to use as a hint) without holding the pset lock. + */ +bool +pset_should_accept_spilled_thread(processor_set_t pset, int spilled_thread_priority) +{ + if ((pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) { + return true; + } + + uint64_t cpu_map = (pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING]); + + for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) { + processor_t processor = processor_array[cpuid]; + + if (processor->current_recommended_pset_type == PSET_AMP_P) { + /* This processor is already running a spilled thread */ + continue; + } + + if (processor->current_pri < spilled_thread_priority) { + return true; + } + } + + return false; +} + +/* + * should_spill_to_ecores() + * + * Spill policy is implemented here + */ +bool +should_spill_to_ecores(processor_set_t nset, thread_t thread) +{ + if (nset->pset_cluster_type == PSET_AMP_E) { + /* Not relevant if ecores already preferred */ + return false; + } + + if (!pset_is_recommended(ecore_set)) { + /* E cores must be recommended */ + return false; + } + 
+#if !CONFIG_SCHED_CLUTCH + /* Per-thread P-core scheduling support needs to be implemented for clutch scheduler */ + if (thread->sched_flags & TH_SFLAG_PCORE_ONLY) { + return false; + } +#endif /* !CONFIG_SCHED_CLUTCH */ + + if (thread->sched_pri >= BASEPRI_RTQUEUES) { + /* Never spill realtime threads */ + return false; + } + + if ((nset->recommended_bitmask & nset->cpu_state_map[PROCESSOR_IDLE]) != 0) { + /* Don't spill if idle cores */ + return false; + } + + if ((sched_get_pset_load_average(nset) >= sched_amp_spill_threshold(nset)) && /* There is already a load on P cores */ + pset_should_accept_spilled_thread(ecore_set, thread->sched_pri)) { /* There are lower priority E cores */ + return true; + } + + return false; +} + +/* + * sched_amp_check_spill() + * + * Routine to check if the thread should be spilled and signal the pset if needed. + */ +void +sched_amp_check_spill(processor_set_t pset, thread_t thread) +{ + /* pset is unlocked */ + + /* Bound threads don't call this function */ + assert(thread->bound_processor == PROCESSOR_NULL); + + if (should_spill_to_ecores(pset, thread)) { + pset_lock(ecore_set); + + pset_signal_spill(ecore_set, thread->sched_pri); + /* returns with ecore_set unlocked */ + } +} + +/* + * sched_amp_steal_threshold() + * + * Routine to calculate the steal threshold + */ +int +sched_amp_steal_threshold(processor_set_t pset, bool spill_pending) +{ + int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask); + + return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + (spill_pending ? 
sched_amp_spill_steal : sched_amp_idle_steal); +} + +/* + * sched_amp_steal_thread_enabled() + * + */ +bool +sched_amp_steal_thread_enabled(processor_set_t pset) +{ + return (pset->pset_cluster_type == PSET_AMP_E) && (pcore_set->online_processor_count > 0); +} + +/* + * sched_amp_balance() + * + * Invoked with pset locked, returns with pset unlocked + */ +void +sched_amp_balance(processor_t cprocessor, processor_set_t cpset) +{ + assert(cprocessor == current_processor()); + + pset_unlock(cpset); + + if (cpset->pset_cluster_type == PSET_AMP_E || !cprocessor->is_recommended) { + return; + } + + /* + * cprocessor is an idle, recommended P core processor. + * Look for P-eligible threads that have spilled to an E core + * and coax them to come back. + */ + + processor_set_t pset = ecore_set; + + pset_lock(pset); + + processor_t eprocessor; + uint64_t ast_processor_map = 0; + + sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE}; + uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING]; + for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) { + eprocessor = processor_array[cpuid]; + if ((eprocessor->current_pri < BASEPRI_RTQUEUES) && + (eprocessor->current_recommended_pset_type == PSET_AMP_P)) { + ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, false, SCHED_IPI_EVENT_REBALANCE); + if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) { + bit_set(ast_processor_map, eprocessor->cpu_id); + assert(eprocessor != cprocessor); + } + } + } + + pset_unlock(pset); + + for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) { + processor_t ast_processor = processor_array[cpuid]; + sched_ipi_perform(ast_processor, ipi_type[cpuid]); + } +} + +/* + * Helper function for sched_amp_thread_group_recommendation_change() + * Find all the cores in the pset running threads from the thread_group tg + * and send them a rebalance interrupt. 
+ */ +void +sched_amp_bounce_thread_group_from_ecores(processor_set_t pset, struct thread_group *tg) +{ + assert(pset->pset_cluster_type == PSET_AMP_E); + uint64_t ast_processor_map = 0; + sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE}; + + spl_t s = splsched(); + pset_lock(pset); + + uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING]; + for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) { + processor_t eprocessor = processor_array[cpuid]; + if (eprocessor->current_thread_group == tg) { + ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, false, SCHED_IPI_EVENT_REBALANCE); + if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) { + bit_set(ast_processor_map, eprocessor->cpu_id); + } else if (eprocessor == current_processor()) { + ast_on(AST_PREEMPT); + bit_set(pset->pending_AST_PREEMPT_cpu_mask, eprocessor->cpu_id); + } + } + } + + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, tg, ast_processor_map, 0, 0); + + pset_unlock(pset); + + for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) { + processor_t ast_processor = processor_array[cpuid]; + sched_ipi_perform(ast_processor, ipi_type[cpuid]); + } + + splx(s); +} + +/* + * sched_amp_ipi_policy() + */ +sched_ipi_type_t +sched_amp_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event) +{ + processor_set_t pset = dst->processor_set; + assert(bit_test(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id) == false); + assert(dst != current_processor()); + + boolean_t deferred_ipi_supported = false; +#if defined(CONFIG_SCHED_DEFERRED_AST) + deferred_ipi_supported = true; +#endif /* CONFIG_SCHED_DEFERRED_AST */ + + switch (event) { + case SCHED_IPI_EVENT_SPILL: + /* For Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi set */ + if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) { + return 
sched_ipi_deferred_policy(pset, dst, event); + } + break; + case SCHED_IPI_EVENT_PREEMPT: + /* For preemption, the default policy is to use deferred IPIs + * for Non-RT P-core preemption. Override that behavior if + * sched_amp_pcores_preempt_immediate_ipi is set + */ + if (thread && thread->sched_pri < BASEPRI_RTQUEUES) { + if (sched_amp_pcores_preempt_immediate_ipi && (pset == pcore_set)) { + return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE; + } + } + break; + default: + break; + } + /* Default back to the global policy for all other scenarios */ + return sched_ipi_policy(dst, thread, dst_idle, event); +} + +/* + * sched_amp_qos_max_parallelism() + */ +uint32_t +sched_amp_qos_max_parallelism(int qos, uint64_t options) +{ + uint32_t ecount = ecore_set->cpu_set_count; + uint32_t pcount = pcore_set->cpu_set_count; + + if (options & QOS_PARALLELISM_REALTIME) { + /* For realtime threads on AMP, we would want them + * to limit the width to just the P-cores since we + * do not spill/rebalance for RT threads. + */ + return pcount; + } + + /* + * The current AMP scheduler policy is to not run + * background and utility threads on the P-Cores. 
+ */ + switch (qos) { + case THREAD_QOS_UTILITY: + case THREAD_QOS_BACKGROUND: + case THREAD_QOS_MAINTENANCE: + return ecount; + default: + return ecount + pcount; + } +} + +/* + * sched_amp_rt_runq() + */ +rt_queue_t +sched_amp_rt_runq(processor_set_t pset) +{ + return &pset->rt_runq; +} + +/* + * sched_amp_rt_init() + */ +void +sched_amp_rt_init(processor_set_t pset) +{ + pset_rt_init(pset); +} + +/* + * sched_amp_rt_queue_shutdown() + */ +void +sched_amp_rt_queue_shutdown(processor_t processor) +{ + processor_set_t pset = processor->processor_set; + thread_t thread; + queue_head_t tqueue; + + pset_lock(pset); + + /* We only need to migrate threads if this is the last active or last recommended processor in the pset */ + if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) { + pset_unlock(pset); + return; + } + + queue_init(&tqueue); + + rt_lock_lock(pset); + + while (rt_runq_count(pset) > 0) { + thread = qe_dequeue_head(&pset->rt_runq.queue, struct thread, runq_links); + thread->runq = PROCESSOR_NULL; + SCHED_STATS_RUNQ_CHANGE(&pset->rt_runq.runq_stats, pset->rt_runq.count); + rt_runq_count_decr(pset); + enqueue_tail(&tqueue, &thread->runq_links); + } + rt_lock_unlock(pset); + sched_update_pset_load_average(pset); + pset_unlock(pset); + + qe_foreach_element_safe(thread, &tqueue, runq_links) { + remqueue(&thread->runq_links); + + thread_lock(thread); + + thread_setrun(thread, SCHED_TAILQ); + + thread_unlock(thread); + } +} + +/* + * sched_amp_rt_runq_scan() + * + * Assumes RT lock is not held, and acquires splsched/rt_lock itself + */ +void +sched_amp_rt_runq_scan(sched_update_scan_context_t scan_context) +{ + thread_t thread; + + pset_node_t node = &pset_node0; + processor_set_t pset = node->psets; + + spl_t s = splsched(); + do { + while (pset != NULL) { + rt_lock_lock(pset); + + qe_foreach_element_safe(thread, &pset->rt_runq.queue, runq_links) { + if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) { + 
scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time; + } + } + + rt_lock_unlock(pset); + + pset = pset->pset_list; + } + } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL)); + splx(s); +} + +/* + * sched_amp_rt_runq_count_sum() + */ +int64_t +sched_amp_rt_runq_count_sum(void) +{ + pset_node_t node = &pset_node0; + processor_set_t pset = node->psets; + int64_t count = 0; + + do { + while (pset != NULL) { + count += pset->rt_runq.runq_stats.count_sum; + + pset = pset->pset_list; + } + } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL)); + + return count; +} + +#endif /* __AMP__ */ diff --git a/osfmk/kern/sched_amp_common.h b/osfmk/kern/sched_amp_common.h new file mode 100644 index 000000000..e29cf07f3 --- /dev/null +++ b/osfmk/kern/sched_amp_common.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_SCHED_AMP_COMMON_H_ +#define _KERN_SCHED_AMP_COMMON_H_ + +#if __AMP__ + +/* Routine to initialize processor sets on AMP platforms */ +void sched_amp_init(void); + +/* + * The AMP scheduler uses spill/steal/rebalance logic to make sure the most appropriate threads + * are scheduled on the P/E clusters. Here are the definitions of those terms: + * + * - Spill: Spill threads from an overcommitted P-cluster onto the E-cluster. This is needed to make sure + * that high priority P-recommended threads experience low scheduling latency in the presence of + * lots of P-recommended threads. + * + * - Steal: From an E-core, steal a thread from the P-cluster to provide low scheduling latency for + * P-recommended threads. + * + * - Rebalance: Once a P-core goes idle, check if the E-cores are running any P-recommended threads and + * bring them back to run on their recommended cluster type. 
+ */ + +/* Spill logic */ +int sched_amp_spill_threshold(processor_set_t pset); +void pset_signal_spill(processor_set_t pset, int spilled_thread_priority); +bool pset_should_accept_spilled_thread(processor_set_t pset, int spilled_thread_priority); +bool should_spill_to_ecores(processor_set_t nset, thread_t thread); +void sched_amp_check_spill(processor_set_t pset, thread_t thread); + +/* Steal logic */ +int sched_amp_steal_threshold(processor_set_t pset, bool spill_pending); +bool sched_amp_steal_thread_enabled(processor_set_t pset); + +/* Rebalance logic */ +void sched_amp_balance(processor_t cprocessor, processor_set_t cpset); + +/* IPI policy */ +sched_ipi_type_t sched_amp_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event); + +/* AMP realtime runq management */ +rt_queue_t sched_amp_rt_runq(processor_set_t pset); +void sched_amp_rt_init(processor_set_t pset); +void sched_amp_rt_queue_shutdown(processor_t processor); +void sched_amp_rt_runq_scan(sched_update_scan_context_t scan_context); +int64_t sched_amp_rt_runq_count_sum(void); + +uint32_t sched_amp_qos_max_parallelism(int qos, uint64_t options); +void sched_amp_bounce_thread_group_from_ecores(processor_set_t pset, struct thread_group *tg); + +#endif /* __AMP__ */ + +#endif /* _KERN_SCHED_AMP_COMMON_H_ */ diff --git a/osfmk/kern/sched_clutch.c b/osfmk/kern/sched_clutch.c index 7a246a05e..d8a808f60 100644 --- a/osfmk/kern/sched_clutch.c +++ b/osfmk/kern/sched_clutch.c @@ -46,6 +46,9 @@ #include #include +#if __AMP__ +#include +#endif /* __AMP__ */ #if CONFIG_SCHED_CLUTCH @@ -92,6 +95,10 @@ static uint32_t sched_clutch_root_urgency(sched_clutch_root_t); static uint32_t sched_clutch_root_count_sum(sched_clutch_root_t); static int sched_clutch_root_priority(sched_clutch_root_t); +#if __AMP__ +/* System based routines */ +static bool sched_clutch_pset_available(processor_set_t); +#endif /* __AMP__ */ /* Helper debugging routines */ static inline void 
sched_clutch_hierarchy_locked_assert(sched_clutch_root_t); @@ -250,6 +257,30 @@ sched_clutch_thr_count_dec( } } +#if __AMP__ + +/* + * sched_clutch_pset_available() + * + * Routine to determine if a pset is available for scheduling. + */ +static bool +sched_clutch_pset_available(processor_set_t pset) +{ + /* Check if cluster has none of the CPUs available */ + if (pset->online_processor_count == 0) { + return false; + } + + /* Check if the cluster is not recommended by CLPC */ + if (!pset_is_recommended(pset)) { + return false; + } + + return true; +} + +#endif /* __AMP__ */ /* * sched_clutch_root_init() @@ -748,6 +779,34 @@ sched_clutch_destroy( assert(os_atomic_load(&clutch->sc_thr_count, relaxed) == 0); } +#if __AMP__ + +/* + * sched_clutch_bucket_foreign() + * + * Identifies if the clutch bucket is foreign to (not recommended for) this + * hierarchy. This is possible when the recommended hierarchy/pset is not + * currently available for scheduling. + */ +static boolean_t +sched_clutch_bucket_foreign(sched_clutch_root_t root_clutch, sched_clutch_bucket_t clutch_bucket) +{ + assert(clutch_bucket->scb_thr_count > 0); + if (!sched_clutch_pset_available(root_clutch->scr_pset)) { + /* Even though the pset was not available for scheduling, threads + * are being put in its runq (this might be due to the other pset + * being turned off and this being the master processor pset). + * Mark the clutch bucket as foreign so that when the other + * pset becomes available, it moves the clutch bucket accordingly. 
+ */ + return true; + } + thread_t thread = run_queue_peek(&clutch_bucket->scb_runq); + pset_cluster_type_t pset_type = recommended_pset_type(thread); + return pset_type != root_clutch->scr_pset->pset_cluster_type; +} + +#endif /* __AMP__ */ /* * sched_clutch_bucket_hierarchy_insert() @@ -766,6 +825,13 @@ sched_clutch_bucket_hierarchy_insert( /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */ enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink); } +#if __AMP__ + /* Check if the bucket is a foreign clutch bucket and add it to the foreign buckets list */ + if (sched_clutch_bucket_foreign(root_clutch, clutch_bucket)) { + clutch_bucket->scb_foreign = true; + enqueue_tail(&root_clutch->scr_foreign_buckets, &clutch_bucket->scb_foreignlink); + } +#endif /* __AMP__ */ sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket]; /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */ @@ -797,6 +863,12 @@ sched_clutch_bucket_hierarchy_remove( /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */ remqueue(&clutch_bucket->scb_listlink); } +#if __AMP__ + if (clutch_bucket->scb_foreign) { + clutch_bucket->scb_foreign = false; + remqueue(&clutch_bucket->scb_foreignlink); + } +#endif /* __AMP__ */ sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket]; @@ -2170,5 +2242,655 @@ sched_clutch_update_thread_bucket(thread_t thread) } } +#if __AMP__ + +/* Implementation of the AMP version of the clutch scheduler */ + +static thread_t +sched_clutch_amp_steal_thread(processor_set_t pset); + +static ast_t +sched_clutch_amp_processor_csw_check(processor_t processor); + +static boolean_t +sched_clutch_amp_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte); + +static boolean_t +sched_clutch_amp_processor_queue_empty(processor_t processor); + +static 
thread_t +sched_clutch_amp_choose_thread(processor_t processor, int priority, ast_t reason); + +static void +sched_clutch_amp_processor_queue_shutdown(processor_t processor); + +static processor_t +sched_clutch_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread); + +static bool +sched_clutch_amp_thread_avoid_processor(processor_t processor, thread_t thread); + +static bool +sched_clutch_amp_thread_should_yield(processor_t processor, thread_t thread); + +static void +sched_clutch_migrate_foreign_buckets(processor_t processor, processor_set_t dst_pset, boolean_t drop_lock); + +static void +sched_clutch_amp_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation); + +const struct sched_dispatch_table sched_clutch_amp_dispatch = { + .sched_name = "clutch_amp", + .init = sched_amp_init, + .timebase_init = sched_clutch_timebase_init, + .processor_init = sched_clutch_processor_init, + .pset_init = sched_clutch_pset_init, + .maintenance_continuation = sched_timeshare_maintenance_continue, + .choose_thread = sched_clutch_amp_choose_thread, + .steal_thread_enabled = sched_amp_steal_thread_enabled, + .steal_thread = sched_clutch_amp_steal_thread, + .compute_timeshare_priority = sched_compute_timeshare_priority, + .choose_processor = sched_clutch_amp_choose_processor, + .processor_enqueue = sched_clutch_processor_enqueue, + .processor_queue_shutdown = sched_clutch_amp_processor_queue_shutdown, + .processor_queue_remove = sched_clutch_processor_queue_remove, + .processor_queue_empty = sched_clutch_amp_processor_queue_empty, + .priority_is_urgent = priority_is_urgent, + .processor_csw_check = sched_clutch_amp_processor_csw_check, + .processor_queue_has_priority = sched_clutch_amp_processor_queue_has_priority, + .initial_quantum_size = sched_clutch_initial_quantum_size, + .initial_thread_sched_mode = sched_clutch_initial_thread_sched_mode, + .can_update_priority = can_update_priority, + .update_priority = 
update_priority, + .lightweight_update_priority = lightweight_update_priority, + .quantum_expire = sched_default_quantum_expire, + .processor_runq_count = sched_clutch_runq_count, + .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum, + .processor_bound_count = sched_clutch_processor_bound_count, + .thread_update_scan = sched_clutch_thread_update_scan, + .multiple_psets_enabled = TRUE, + .sched_groups_enabled = FALSE, + .avoid_processor_enabled = TRUE, + .thread_avoid_processor = sched_clutch_amp_thread_avoid_processor, + .processor_balance = sched_amp_balance, + + .rt_runq = sched_amp_rt_runq, + .rt_init = sched_amp_rt_init, + .rt_queue_shutdown = sched_amp_rt_queue_shutdown, + .rt_runq_scan = sched_amp_rt_runq_scan, + .rt_runq_count_sum = sched_amp_rt_runq_count_sum, + + .qos_max_parallelism = sched_amp_qos_max_parallelism, + .check_spill = sched_amp_check_spill, + .ipi_policy = sched_amp_ipi_policy, + .thread_should_yield = sched_clutch_amp_thread_should_yield, + .run_count_incr = sched_clutch_run_incr, + .run_count_decr = sched_clutch_run_decr, + .update_thread_bucket = sched_clutch_update_thread_bucket, + .pset_made_schedulable = sched_clutch_migrate_foreign_buckets, + .thread_group_recommendation_change = sched_clutch_amp_thread_group_recommendation_change, +}; + +extern processor_set_t ecore_set; +extern processor_set_t pcore_set; + +static thread_t +sched_clutch_amp_choose_thread( + processor_t processor, + int priority, + __unused ast_t reason) +{ + processor_set_t pset = processor->processor_set; + bool spill_pending = false; + int spill_pri = -1; + + if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) { + spill_pending = true; + spill_pri = sched_clutch_root_priority(&pcore_set->pset_clutch_root); + } + + int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)); + run_queue_t bound_runq = sched_clutch_bound_runq(processor); + boolean_t choose_from_boundq = false; + + if 
((bound_runq->highq < priority) && + (clutch_pri < priority) && + (spill_pri < priority)) { + return THREAD_NULL; + } + + if ((spill_pri > bound_runq->highq) && + (spill_pri > clutch_pri)) { + /* + * There is a higher priority thread on the P-core runq, + * so returning THREAD_NULL here will cause thread_select() + * to call sched_clutch_amp_steal_thread() to try to get it. + */ + return THREAD_NULL; + } + + if (bound_runq->highq >= clutch_pri) { + choose_from_boundq = true; + } + + thread_t thread = THREAD_NULL; + if (choose_from_boundq == false) { + sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor); + thread = sched_clutch_thread_highest(pset_clutch_root); + } else { + thread = run_queue_dequeue(bound_runq, SCHED_HEADQ); + } + return thread; +} + +static boolean_t +sched_clutch_amp_processor_queue_empty(processor_t processor) +{ + processor_set_t pset = processor->processor_set; + bool spill_pending = bit_test(pset->pending_spill_cpu_mask, processor->cpu_id); + + return (sched_clutch_root_count(sched_clutch_processor_root_clutch(processor)) == 0) && + (sched_clutch_bound_runq(processor)->count == 0) && + !spill_pending; +} + +static bool +sched_clutch_amp_thread_should_yield(processor_t processor, thread_t thread) +{ + if (!sched_clutch_amp_processor_queue_empty(processor) || (rt_runq_count(processor->processor_set) > 0)) { + return true; + } + + if ((processor->processor_set->pset_cluster_type == PSET_AMP_E) && (recommended_pset_type(thread) == PSET_AMP_P)) { + return sched_clutch_root_count(&pcore_set->pset_clutch_root) > 0; + } + + return false; +} + +static ast_t +sched_clutch_amp_processor_csw_check(processor_t processor) +{ + boolean_t has_higher; + int pri; + + int clutch_pri = sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)); + run_queue_t bound_runq = sched_clutch_bound_runq(processor); + + assert(processor->active_thread != NULL); + + processor_set_t pset = processor->processor_set; + bool 
spill_pending = false; + int spill_pri = -1; + int spill_urgency = 0; + + if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) { + spill_pending = true; + spill_pri = sched_clutch_root_priority(&pcore_set->pset_clutch_root); + spill_urgency = sched_clutch_root_urgency(&pcore_set->pset_clutch_root); + } + + pri = MAX(clutch_pri, bound_runq->highq); + if (spill_pending) { + pri = MAX(pri, spill_pri); + } + + if (processor->first_timeslice) { + has_higher = (pri > processor->current_pri); + } else { + has_higher = (pri >= processor->current_pri); + } + + if (has_higher) { + if (sched_clutch_root_urgency(sched_clutch_processor_root_clutch(processor)) > 0) { + return AST_PREEMPT | AST_URGENT; + } + + if (bound_runq->urgency > 0) { + return AST_PREEMPT | AST_URGENT; + } + + if (spill_urgency > 0) { + return AST_PREEMPT | AST_URGENT; + } + + return AST_PREEMPT; + } + + return AST_NONE; +} + +static boolean_t +sched_clutch_amp_processor_queue_has_priority(processor_t processor, + int priority, + boolean_t gte) +{ + bool spill_pending = false; + int spill_pri = -1; + processor_set_t pset = processor->processor_set; + + if (pset == ecore_set && bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) { + spill_pending = true; + spill_pri = sched_clutch_root_priority(&pcore_set->pset_clutch_root); + } + run_queue_t bound_runq = sched_clutch_bound_runq(processor); + + int qpri = MAX(sched_clutch_root_priority(sched_clutch_processor_root_clutch(processor)), bound_runq->highq); + if (spill_pending) { + qpri = MAX(qpri, spill_pri); + } + + if (gte) { + return qpri >= priority; + } else { + return qpri > priority; + } +} + +/* + * sched_clutch_hierarchy_thread_pset() + * + * Routine to determine where a thread should be enqueued based on its + * recommendation if this is the first runnable thread in the clutch_bucket + * or its clutch bucket's hierarchy membership. 
+ */ +static processor_set_t +sched_clutch_hierarchy_thread_pset(thread_t thread) +{ + if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread) == false) { + return (recommended_pset_type(thread) == PSET_AMP_P) ? pcore_set : ecore_set; + } + + sched_clutch_t clutch = sched_clutch_for_thread(thread); + sched_clutch_bucket_t clutch_bucket = &(clutch->sc_clutch_buckets[thread->th_sched_bucket]); + sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed); + if (scb_root) { + /* Clutch bucket is already runnable, return the pset hierarchy its part of */ + return scb_root->scr_pset; + } + return (recommended_pset_type(thread) == PSET_AMP_E) ? ecore_set : pcore_set; +} + +/* + * sched_clutch_thread_pset_recommended() + * + * Routine to determine if the thread should be placed on the provided pset. + * The routine first makes sure the cluster is available for scheduling. If + * it is available, it looks at the thread's recommendation. Called + * with the pset lock held. + */ +static bool +sched_clutch_thread_pset_recommended(thread_t thread, processor_set_t pset) +{ + if (!sched_clutch_pset_available(pset)) { + return false; + } + + /* At this point, all clusters should be available and recommended */ + if (sched_clutch_hierarchy_thread_pset(thread) != pset) { + return false; + } + + return true; +} + + +static void +sched_clutch_amp_processor_queue_shutdown(processor_t processor) +{ + processor_set_t pset = processor->processor_set; + sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor); + thread_t thread; + queue_head_t tqueue; + + /* We only need to migrate threads if this is the last active or last recommended processor in the pset */ + if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) { + pset_unlock(pset); + return; + } + + queue_init(&tqueue); + while (sched_clutch_root_count(pset_clutch_root) > 0) { + thread = sched_clutch_thread_highest(pset_clutch_root); + enqueue_tail(&tqueue, &thread->runq_links); + } + 
pset_unlock(pset); + + qe_foreach_element_safe(thread, &tqueue, runq_links) { + remqueue(&thread->runq_links); + thread_lock(thread); + thread_setrun(thread, SCHED_TAILQ); + thread_unlock(thread); + } +} + +static thread_t +sched_clutch_amp_steal_thread(processor_set_t pset) +{ + thread_t thread = THREAD_NULL; + processor_set_t nset = pset; + + if (pcore_set->online_processor_count == 0) { + /* Nothing to steal from */ + goto out; + } + + if (pset->pset_cluster_type == PSET_AMP_P) { + /* P cores don't steal from E cores */ + goto out; + } + + processor_t processor = current_processor(); + assert(pset == processor->processor_set); + + bool spill_pending = bit_test(pset->pending_spill_cpu_mask, processor->cpu_id); + bit_clear(pset->pending_spill_cpu_mask, processor->cpu_id); + + nset = pcore_set; + + assert(nset != pset); + + if (sched_get_pset_load_average(nset) >= sched_amp_steal_threshold(nset, spill_pending)) { + pset_unlock(pset); + + pset = nset; + + pset_lock(pset); + + /* Allow steal if load average still OK, no idle cores, and more threads on runq than active cores DISPATCHING */ + if ((sched_get_pset_load_average(pset) >= sched_amp_steal_threshold(pset, spill_pending)) && + ((int)sched_clutch_root_count(&pset->pset_clutch_root) > bit_count(pset->cpu_state_map[PROCESSOR_DISPATCHING])) && + (bit_count(pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) == 0)) { + thread = sched_clutch_thread_highest(&pset->pset_clutch_root); + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_STEAL) | DBG_FUNC_NONE, spill_pending, 0, 0, 0); + sched_update_pset_load_average(pset); + } + } + +out: + pset_unlock(pset); + return thread; +} + +/* Return true if this thread should not continue running on this processor */ +static bool +sched_clutch_amp_thread_avoid_processor(processor_t processor, thread_t thread) +{ + if (processor->processor_set->pset_cluster_type == PSET_AMP_E) { + if (sched_clutch_thread_pset_recommended(thread, pcore_set)) { + return true; + } + } else 
if (processor->processor_set->pset_cluster_type == PSET_AMP_P) { + if (!sched_clutch_thread_pset_recommended(thread, pcore_set)) { + return true; + } + } + + return false; +} + +static processor_t +sched_clutch_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread) +{ + /* Bound threads don't call this function */ + assert(thread->bound_processor == PROCESSOR_NULL); + + processor_set_t nset; + processor_t chosen_processor = PROCESSOR_NULL; + +select_pset: + nset = (pset == ecore_set) ? pcore_set : ecore_set; + if (!sched_clutch_pset_available(pset)) { + /* If the current pset is not available for scheduling, just use the other pset */ + pset_unlock(pset); + pset_lock(nset); + goto select_processor; + } + + /* Check if the thread is recommended to run on this pset */ + if (sched_clutch_thread_pset_recommended(thread, pset)) { + nset = pset; + goto select_processor; + } else { + /* pset not recommended; try the other pset */ + pset_unlock(pset); + pset_lock(nset); + pset = nset; + goto select_pset; + } + +select_processor: + if (!sched_clutch_pset_available(nset)) { + /* + * It looks like both psets are not available due to some + * reason. In that case, just use the master processor's + * pset for scheduling. + */ + if (master_processor->processor_set != nset) { + pset_unlock(nset); + nset = master_processor->processor_set; + pset_lock(nset); + } + } + chosen_processor = choose_processor(nset, processor, thread); + assert(chosen_processor->processor_set == nset); + return chosen_processor; +} + +/* + * AMP Clutch Scheduler Thread Migration + * + * For the AMP version of the clutch scheduler the thread is always scheduled via its + * thread group. So it is important to make sure that the thread group is part of the + * correct processor set hierarchy. In order to do that, the clutch scheduler moves + * all eligble clutch buckets to the correct hierarchy when the recommendation of a + * thread group is changed by CLPC. 
+ */ + +/* + * sched_clutch_recommended_pset() + * + * Routine to decide which hierarchy the thread group should be in based on the + * recommendation and other thread group and system properties. This routine is + * used to determine if thread group migration is necessary and should mimic the + * logic in sched_clutch_thread_pset_recommended() & recommended_pset_type(). + */ +static processor_set_t +sched_clutch_recommended_pset(sched_clutch_t sched_clutch, cluster_type_t recommendation) +{ + if (!sched_clutch_pset_available(pcore_set)) { + return ecore_set; + } + + if (!sched_clutch_pset_available(ecore_set)) { + return pcore_set; + } + + /* + * If all clusters are available and recommended, use the recommendation + * to decide which cluster to use. + */ + pset_cluster_type_t type = thread_group_pset_recommendation(sched_clutch->sc_tg, recommendation); + return (type == PSET_AMP_E) ? ecore_set : pcore_set; +} + +static void +sched_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, queue_t clutch_threads) +{ + uint16_t thread_count = clutch_bucket->scb_thr_count; + thread_t thread; + uint64_t current_timestamp = mach_approximate_time(); + while (thread_count > 0) { + thread = run_queue_peek(&clutch_bucket->scb_runq); + sched_clutch_thread_remove(root_clutch, thread, current_timestamp); + enqueue_tail(clutch_threads, &thread->runq_links); + thread_count--; + } + + /* + * This operation should have drained the clutch bucket and pulled it out of the + * hierarchy. + */ + assert(clutch_bucket->scb_thr_count == 0); + assert(clutch_bucket->scb_root == NULL); +} + +/* + * sched_clutch_migrate_thread_group() + * + * Routine to implement the migration of threads when the thread group + * recommendation is updated. The migration works using a 2-phase + * algorithm. 
+ * + * Phase 1: With the source pset (determined by sched_clutch_recommended_pset) + * locked, drain all the runnable threads into a local queue and update the TG + * recommendation. + * + * Phase 2: Call thread_setrun() on all the drained threads. Since the TG recommendation + * has been updated, these should all end up in the right hierarchy. + */ +static void +sched_clutch_migrate_thread_group(sched_clutch_t sched_clutch, cluster_type_t new_recommendation) +{ + thread_t thread; + + /* If the thread group is empty, just update the recommendation */ + if (os_atomic_load(&sched_clutch->sc_thr_count, relaxed) == 0) { + thread_group_update_recommendation(sched_clutch->sc_tg, new_recommendation); + return; + } + + processor_set_t dst_pset = sched_clutch_recommended_pset(sched_clutch, new_recommendation); + processor_set_t src_pset = (dst_pset == pcore_set) ? ecore_set : pcore_set; + + queue_head_t clutch_threads; + queue_init(&clutch_threads); + + /* Interrupts need to be disabled to make sure threads wont become runnable during the + * migration and attempt to grab the pset/thread locks. + */ + spl_t s = splsched(); + + pset_lock(src_pset); + for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) { + sched_clutch_bucket_t clutch_bucket = &(sched_clutch->sc_clutch_buckets[bucket]); + sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed); + if ((scb_root == NULL) || (scb_root->scr_pset == dst_pset)) { + /* Clutch bucket not runnable or already in the right hierarchy; nothing to do here */ + continue; + } + assert(scb_root->scr_pset == src_pset); + /* Now remove all the threads from the runq so that thread->runq is set correctly */ + sched_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads); + } + + /* + * Now that all the clutch buckets have been drained, update the TG recommendation. 
+ * This operation needs to be done with the pset lock held to make sure that anyone + * coming in before the migration started would get the original pset as the root + * of this sched_clutch and attempt to hold the src_pset lock. Once the TG changes, + * all threads that are becoming runnable would find the clutch bucket empty and + * the TG recommendation would coax them to enqueue it in the new recommended + * hierarchy. This effectively synchronizes with other threads calling + * thread_setrun() and trying to decide which pset the thread/clutch_bucket + * belongs in. + */ + thread_group_update_recommendation(sched_clutch->sc_tg, new_recommendation); + pset_unlock(src_pset); + + /* Now setrun all the threads in the local queue */ + qe_foreach_element_safe(thread, &clutch_threads, runq_links) { + remqueue(&thread->runq_links); + thread_lock(thread); + thread_setrun(thread, SCHED_TAILQ); + thread_unlock(thread); + } + + splx(s); +} + +static void +sched_clutch_amp_thread_group_recommendation_change(struct thread_group *tg, cluster_type_t new_recommendation) +{ + /* + * For the clutch scheduler, the change in recommendation moves the thread group + * to the right hierarchy. sched_clutch_migrate_thread_group() is also responsible + * for updating the recommendation of the thread group. + */ + sched_clutch_migrate_thread_group(&tg->tg_sched_clutch, new_recommendation); + + if (new_recommendation != CLUSTER_TYPE_P) { + return; + } + + sched_amp_bounce_thread_group_from_ecores(ecore_set, tg); +} + +/* + * sched_clutch_migrate_foreign_buckets() + * + * Routine to migrate all the clutch buckets which are not in their recommended + * pset hierarchy now that a new pset has become runnable. The algorithm is + * similar to sched_clutch_migrate_thread_group(). + * + * Invoked with the newly recommended pset lock held and interrupts disabled. 
+ */ +static void +sched_clutch_migrate_foreign_buckets(__unused processor_t processor, processor_set_t dst_pset, boolean_t drop_lock) +{ + thread_t thread; + processor_set_t src_pset = (dst_pset == pcore_set) ? ecore_set : pcore_set; + + if (!sched_clutch_pset_available(dst_pset)) { + /* + * It is possible that some state about the pset changed, + * but its still not available for scheduling. Nothing to + * do here in that case. + */ + if (drop_lock) { + pset_unlock(dst_pset); + } + return; + } + pset_unlock(dst_pset); + + queue_head_t clutch_threads; + queue_init(&clutch_threads); + sched_clutch_root_t src_root = &src_pset->pset_clutch_root; + + pset_lock(src_pset); + queue_t clutch_bucket_list = &src_pset->pset_clutch_root.scr_foreign_buckets; + + if (sched_clutch_root_count(src_root) == 0) { + /* No threads present in this hierarchy */ + pset_unlock(src_pset); + goto migration_complete; + } + + sched_clutch_bucket_t clutch_bucket; + qe_foreach_element_safe(clutch_bucket, clutch_bucket_list, scb_foreignlink) { + sched_clutch_root_t scb_root = os_atomic_load(&clutch_bucket->scb_root, relaxed); + assert(scb_root->scr_pset == src_pset); + /* Now remove all the threads from the runq so that thread->runq is set correctly */ + sched_clutch_bucket_threads_drain(clutch_bucket, scb_root, &clutch_threads); + assert(clutch_bucket->scb_foreign == false); + } + pset_unlock(src_pset); + + /* Now setrun all the threads in the local queue */ + qe_foreach_element_safe(thread, &clutch_threads, runq_links) { + remqueue(&thread->runq_links); + thread_lock(thread); + thread_setrun(thread, SCHED_TAILQ); + thread_unlock(thread); + } + +migration_complete: + if (!drop_lock) { + pset_lock(dst_pset); + } +} + +#endif /* __AMP__ */ #endif /* CONFIG_SCHED_CLUTCH */ diff --git a/osfmk/kern/sched_clutch.h b/osfmk/kern/sched_clutch.h index 4cfad12f5..eef5bee4d 100644 --- a/osfmk/kern/sched_clutch.h +++ b/osfmk/kern/sched_clutch.h @@ -213,6 +213,10 @@ struct sched_clutch_bucket { /* (P) 
linkage for all clutch_buckets in a root bucket; used for tick operations */ queue_chain_t scb_listlink; +#if __AMP__ + /* (P) linkage for all "foreign" clutch buckets in the root clutch */ + queue_chain_t scb_foreignlink; +#endif /* __AMP__ */ /* (P) timestamp for the last time the interactivity score was updated */ uint64_t scb_interactivity_ts; diff --git a/osfmk/kern/sched_prim.c b/osfmk/kern/sched_prim.c index c312e0b4e..42e73b4f0 100644 --- a/osfmk/kern/sched_prim.c +++ b/osfmk/kern/sched_prim.c @@ -4139,7 +4139,11 @@ choose_processor( * platforms, simply return the master_processor. */ fallback_processor = true; +#if CONFIG_SCHED_CLUTCH && __AMP__ + processor = processor_array[lsb_first(starting_pset->primary_map)]; +#else /* CONFIG_SCHED_CLUTCH && __AMP__ */ processor = master_processor; +#endif /* CONFIG_SCHED_CLUTCH && __AMP__ */ } /* @@ -6069,6 +6073,11 @@ sched_update_pset_load_average(processor_set_t pset) pset->load_average = new_load_average; #if (DEVELOPMENT || DEBUG) +#if __AMP__ + if (pset->pset_cluster_type == PSET_AMP_P) { + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PSET_LOAD_AVERAGE) | DBG_FUNC_NONE, sched_get_pset_load_average(pset), (bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset))); + } +#endif #endif } @@ -6272,5 +6281,29 @@ sysctl_task_get_no_smt(void) __private_extern__ void thread_bind_cluster_type(char cluster_type) { +#if __AMP__ + thread_t thread = current_thread(); + + spl_t s = splsched(); + thread_lock(thread); + thread->sched_flags &= ~(TH_SFLAG_ECORE_ONLY | TH_SFLAG_PCORE_ONLY); + switch (cluster_type) { + case 'e': + case 'E': + thread->sched_flags |= TH_SFLAG_ECORE_ONLY; + break; + case 'p': + case 'P': + thread->sched_flags |= TH_SFLAG_PCORE_ONLY; + break; + default: + break; + } + thread_unlock(thread); + splx(s); + + thread_block(THREAD_CONTINUE_NULL); +#else /* __AMP__ */ (void)cluster_type; +#endif /* __AMP__ */ } diff --git a/osfmk/kern/sched_prim.h b/osfmk/kern/sched_prim.h 
index 880e84960..9276e2563 100644 --- a/osfmk/kern/sched_prim.h +++ b/osfmk/kern/sched_prim.h @@ -585,6 +585,11 @@ extern boolean_t preemption_enabled(void); #error Enable at least one scheduler algorithm in osfmk/conf/MASTER.XXX #endif +#if __AMP__ +extern const struct sched_dispatch_table sched_amp_dispatch; +#define SCHED(f) (sched_amp_dispatch.f) + +#else /* __AMP__ */ #if CONFIG_SCHED_CLUTCH extern const struct sched_dispatch_table sched_clutch_dispatch; @@ -594,6 +599,7 @@ extern const struct sched_dispatch_table sched_dualq_dispatch; #define SCHED(f) (sched_dualq_dispatch.f) #endif /* CONFIG_SCHED_CLUTCH */ +#endif /* __AMP__ */ struct sched_dispatch_table { const char *sched_name; @@ -766,6 +772,9 @@ extern const struct sched_dispatch_table sched_traditional_with_pset_runqueue_di #if defined(CONFIG_SCHED_MULTIQ) extern const struct sched_dispatch_table sched_multiq_dispatch; extern const struct sched_dispatch_table sched_dualq_dispatch; +#if __AMP__ +extern const struct sched_dispatch_table sched_amp_dispatch; +#endif #endif #if defined(CONFIG_SCHED_PROTO) diff --git a/osfmk/kern/task.c b/osfmk/kern/task.c index 832c774b4..fd98be481 100644 --- a/osfmk/kern/task.c +++ b/osfmk/kern/task.c @@ -1295,7 +1295,11 @@ init_task_ledgers(void) task_wakeups_rate_exceeded, NULL, NULL); ledger_set_callback(t, task_ledgers.physical_writes, task_io_rate_exceeded, (void *)FLAVOR_IO_PHYSICAL_WRITES, NULL); +#if XNU_MONITOR + ledger_template_complete_secure_alloc(t); +#else /* XNU_MONITOR */ ledger_template_complete(t); +#endif /* XNU_MONITOR */ task_ledger_template = t; } @@ -5540,6 +5544,27 @@ task_energy( return energy; } +#if __AMP__ + +uint64_t +task_cpu_ptime( + task_t task) +{ + uint64_t cpu_ptime = 0; + thread_t thread; + + task_lock(task); + cpu_ptime += task->total_ptime; + + queue_iterate(&task->threads, thread, thread_t, task_threads) { + cpu_ptime += timer_grab(&thread->ptime); + } + + task_unlock(task); + return cpu_ptime; +} + +#else /* __AMP__ */ uint64_t 
task_cpu_ptime( @@ -5548,6 +5573,7 @@ task_cpu_ptime( return 0; } +#endif /* __AMP__ */ /* This function updates the cpu time in the arrays for each * effective and requested QoS class diff --git a/osfmk/kern/thread.h b/osfmk/kern/thread.h index 7242faac7..673259a16 100644 --- a/osfmk/kern/thread.h +++ b/osfmk/kern/thread.h @@ -283,6 +283,10 @@ struct thread { #define TH_SFLAG_BASE_PRI_FROZEN 0x0800 /* (effective) base_pri is frozen */ #define TH_SFLAG_WAITQ_PROMOTED 0x1000 /* promote reason: waitq wakeup (generally for IPC receive) */ +#if __AMP__ +#define TH_SFLAG_ECORE_ONLY 0x2000 /* Bind thread to E core processor set */ +#define TH_SFLAG_PCORE_ONLY 0x4000 /* Bind thread to P core processor set */ +#endif #define TH_SFLAG_EXEC_PROMOTED 0x8000 /* promote reason: thread is in an exec */ diff --git a/osfmk/mach/mach_host.defs b/osfmk/mach/mach_host.defs index 83d485388..a1b55f5eb 100644 --- a/osfmk/mach/mach_host.defs +++ b/osfmk/mach/mach_host.defs @@ -309,8 +309,8 @@ routine host_register_well_known_mach_voucher_attr_manager( * Update the global ATM diagnostic flag, readable from the commpage */ routine host_set_atm_diagnostic_flag( - host_priv : host_priv_t; - in diagnostic_flag : uint32_t); + host : host_t; + in diagnostic_flag : uint32_t); #if !KERNEL && LIBSYSCALL_INTERFACE routine host_get_atm_diagnostic_flag( diff --git a/osfmk/mach/machine.h b/osfmk/mach/machine.h index 654bfc30d..6865cee72 100644 --- a/osfmk/mach/machine.h +++ b/osfmk/mach/machine.h @@ -423,6 +423,9 @@ __END_DECLS #define CPUFAMILY_ARM_HURRICANE 0x67ceee93 #define CPUFAMILY_ARM_MONSOON_MISTRAL 0xe81e7ef6 #define CPUFAMILY_ARM_VORTEX_TEMPEST 0x07d34b9f +#ifndef RC_HIDE_XNU_LIGHTNING +#define CPUFAMILY_ARM_LIGHTNING_THUNDER 0x462504d2 +#endif /* !RC_HIDE_XNU_LIGHTNING */ /* The following synonyms are deprecated: */ #define CPUFAMILY_INTEL_6_23 CPUFAMILY_INTEL_PENRYN diff --git a/osfmk/man/index.html b/osfmk/man/index.html new file mode 100644 index 000000000..2a9d0ff9e --- /dev/null +++ 
b/osfmk/man/index.html @@ -0,0 +1,448 @@ + + + + Mach Kernel Interface Reference Manual + + +

Mach IPC Interface

+
+

+Mach IPC presents itself in a few forms: message queues, lock-sets, +and semaphores (more may be added in the future).  All share one common +characteristic: the capabilities presented by each are represented through +a handle known as a Mach port.  Specific rights represented in these +Mach port capability handles allow the underlying IPC object to be used and +manipulated in consistent ways.

+ +

Mach Message Queue Interface

+
+

+mach_msg - Send and/or receive a message from the target port.
+mach_msg_overwrite - Send and/or receive messages with possible overwrite.
+

+Mach Message Queue Data Structures +

+mach_msg_descriptor - Specifies an element of a complex IPC message.
+mach_msg_header - Specifies the content of an IPC message header.
+

+
+ +

Mach Lock-Set Interface

+
+

+lock_acquire - Acquire ownership of a lock.
+lock_handoff - Hand-off ownership of a lock.
+lock_handoff_accept - Accept lock ownership from a handoff.
+lock_make_stable - Stabilize the state of the specified lock.
+lock_release - Release ownership of a lock.
+lock_set_create - Create a new lock set.
+lock_set_destroy - Destroy a lock set and its associated locks.
+lock_try - Attempt to acquire access rights to a lock.
+

+
+ +

Mach Semaphore Interface

+
+

+semaphore_create - Create a new semaphore.
+semaphore_destroy - Destroy a semaphore.
+semaphore_signal - Increments the semaphore count.
+semaphore_signal_all - Wake up all threads blocked on a semaphore.
+semaphore_wait - Wait on the specified semaphore.
+

+
+ +

Mach Port Management Interface

+
+

+mach_port_allocate - Create caller-specified type of port right.
+mach_port_allocate_full - Create a port right with full Mach port semantics.
+mach_port_allocate_name - Create a port right with the caller-specified name.
+mach_port_allocate_qos - Allocate a port with specified "quality of service".
+mach_port_allocate_subsystem - Create a port right associated with the caller-specified subsystem.
+mach_port_deallocate - Decrement the target port right's user reference count.
+mach_port_destroy - Deallocate all port rights associated with specified name.
+mach_port_extract_right - Remove the specified right from the target task and return it to the caller.
+mach_port_get_attributes - Return information about target port as specified by the caller.
+mach_port_get_refs - Return the current count of user references on the target port right.
+mach_port_get_set_status - Return the port right names contained in the target port set.
+mach_port_insert_right - Insert the specified port right into the target task.
+mach_port_mod_refs - Modify the specified port right's count of user references.
+mach_port_move_member - Move the specified receive right into or out of the specified port set.
+mach_port_names - Return information about a task's port name space.
+mach_port_request_notification - Request notification of the specified port event type.
+mach_port_set_attributes - Set the target port's attributes.
+mach_port_set_mscount - Change the target port's make-send count.
+mach_port_set_seqno - Change the current value of the target port's sequence number.
+mach_port_type - Return the characteristics of the target port name.
+mach_reply_port - Allocate a new port and insert corresponding receive right in the calling task.
+ mach_subsystem_create - Used by a server to register information about an RPC subsystem with the kernel.
+

+Mach Port Data Structures +

+mach_port_limits - Specifies a port's resource and message queue limits.
+mach_port_qos - Specifies a port's attributes with respect to "Quality Of Service."
+mach_port_status - Used to present a port's current status with respect to various important attributes.
+

+Mach Port Notification Callbacks +

+do_mach_notify_dead_name - Handle the current instance of a dead-name notification.
+do_mach_notify_no_senders - Handle the current instance of a no-more-senders notification.
+do_mach_notify_port_deleted - Handle the current instance of a port-deleted notification.
+do_mach_notify_port_destroyed - Handle the current instance of a port-destroyed notification.
+do_mach_notify_send_once - Handle the current instance of a send-once notification.
+

+Mach Port Notification Callback Server Helpers +

+notify_server - Detect and handle a kernel-generated IPC notification.
+

+
+ +
+ +

Mach Virtual Memory Interface

+
+

Mach Virtual Memory Address Space Manipulation Interface

+
+

+host_page_size - Provide the system's virtual page size.
+vm_allocate - Allocate a region of virtual memory.
+vm_behavior_set - Specify expected access patterns for the target VM region.
+vm_copy - Copy a region of virtual memory.
+vm_deallocate - Deallocate a region of virtual memory.
+vm_inherit - Set a VM region's inheritance attribute.
+vm_machine_attribute - Get/set the target memory region's special attributes.
+vm_map - Map the specified memory object to a region of virtual memory.
+vm_msync - Synchronize the specified region of virtual memory.
+vm_protect - Set access privilege attribute for a region of virtual memory.
+vm_read - Read the specified range of target task's address space.
+vm_region - Return description of a virtual memory region.
+vm_remap - Map memory objects from one address space into another.
+ vm_wire - Modify the target region's paging characteristics.
+vm_write - Write data to the specified address in the target address space.
+

+Data Structures +

+vm_region_basic_info - Defines the attributes of a task's memory region.
+vm_statistics - Defines statistics for the kernel's use of virtual memory.
+

+
+ +

External Memory Management Interface

+
+The External Memory Management Interface (EMMI) is undergoing significant change in the Darwin system. +For this reason, the interface is not currently available to user-level programs. Even for kernel +extensions, use of these interfaces is not supported. Instead, the BSD filesystem's Universal Buffer Cache (UBC) +mechanism should be used.
+

+memory_object_change_attributes - Modify subset of memory object attributes.
+memory_object_destroy - Shut down a memory object.
+memory_object_get_attributes - Return current attributes for a memory object.
+memory_object_lock_request - Restrict access to memory object data.
+memory_object_synchronize_completed - Synchronized data has been processed.
+

+Data Structures +

+memory_object_attr_info - Defines memory object attributes.
+memory_object_perf_info - Specifies performance-related memory object attributes.
+

+External Memory Manager Interface Callbacks +

+memory_object_create - Assign a new memory object to the default memory manager.
+memory_object_data_initialize - Provide initial data for a new memory object.
+memory_object_data_request - Request that memory manager page-in specified data.
+memory_object_data_return - Return memory object data to the appropriate memory manager.
+memory_object_data_unlock - Request a memory manager release the lock on specific data.
+memory_object_init - Inform a memory manager on first use of a memory object.
+memory_object_synchronize - Request synchronization of data with backing store.
+memory_object_terminate - Relinquish access to a memory object.
+

+EMMI Callback Server Helpers +

+memory_object_default_server - Handle kernel operation request targeted for the default pager.
+memory_object_server - Handle kernel operation request aimed at a given memory manager.
+

+
+ +

Default Memory Management Interface

+
+

+default_pager_add_segment - Add additional backing storage for a default pager.
+default_pager_backing_store_create - Create a backing storage object.
+ default_pager_backing_store_delete - Delete a backing storage object.
+default_pager_backing_store_info - Return information about a backing storage object.
+default_pager_info - Furnish caller with information about the default pager.
+default_pager_object_create - Initialize a non-persistent memory object.
+host_default_memory_manager - Register/Lookup the host's default pager.
+

+
+ +
+ +

Process Management Interface

+
+ +

Task Interface

+
+

+mach_ports_lookup - Provide caller with an array of the target task's well-known ports.
+mach_ports_register - Register an array of well-known ports on behalf of the target task.
+mach_task_self - Return a send right to the caller's task_self port.
+task_create - Create a new task.
+task_get_emulation_vector - Return an array identifying the target task's user-level system call handlers.
+task_get_exception_ports - Return send rights to the target task's exception ports.
+task_get_special_port - Return a send right to the indicated special port.
+task_info - Return per-task information according to specified flavor.
+task_resume - Decrement the target task's suspend count.
+task_sample - Sample the target task's thread program counters periodically.
+task_set_emulation - Establish a user-level handler for a system call.
+task_set_emulation_vector - Establish the target task's user-level system call handlers.
+task_set_exception_ports - Set target task's exception ports.
+task_set_info - Set task-specific information state.
+task_set_port_space - Set the size of the target task's port name space table.
+task_set_special_port - Set the indicated special port.
+task_suspend - Suspend the target task.
+task_swap_exception_ports - Set target task's exception ports, returning the previous exception ports.
+task_terminate - Terminate the target task and deallocate its resources.
+task_threads - Return the target task's list of threads.
+

+Task Data Structures +

+task_basic_info - Defines basic information for a task.
+task_thread_times_info - Defines thread execution times information for tasks.
+

+
+ +

Thread Interface

+
+

+mach_thread_self - Returns the thread self port.
+thread_abort - Abort a thread.
+thread_abort_safely - Abort a thread, restartably.
+thread_create - Create a thread within a task.
+thread_create_running - Optimized creation of a running thread.
+thread_depress_abort - Cancel thread scheduling depression.
+thread_get_exception_ports - Return a send right to an exception port.
+thread_get_special_port - Return a send right to the caller-specified special port.
+thread_get_state - Return the execution state for a thread.
+thread_info - Return information about a thread.
+thread_resume - Resume a thread.
+thread_sample - Perform periodic PC sampling for a thread.
+thread_set_exception_ports - Set exception ports for a thread.
+thread_set_special_port - Set caller-specified special port belonging to the target thread.
+thread_set_state - Set the target thread's user-mode execution state.
+thread_suspend - Suspend a thread.
+thread_swap_exception_ports - Swap exception ports for a thread.
+thread_terminate - Destroy a thread.
+thread_wire - Mark the thread as privileged with respect to kernel resources.
+

+Thread Data Structures +

+thread_basic_info - Defines basic information for a thread.
+

+Thread Exception Callbacks +

+catch_exception_raise - Handles the occurrence of an exception within a thread.
+

+Thread Exception Callback Server Helpers +

+exc_server - Handle kernel-reported thread exception.
+

+
+ +

Scheduling Interface

+
+

+task_policy - Set target task's default scheduling policy state.
+task_set_policy - Set target task's default scheduling policy state.
+thread_policy - Set target thread's scheduling policy state.
+thread_set_policy - Set target thread's scheduling policy state.
+thread_switch - Cause context switch with options.
+

+Scheduling Data Structures +

+policy_fifo_info - Specifies information associated with the system's First-In-First-Out scheduling policy.
+policy_rr_info - Specifies information associated with the system's Round Robin scheduling policy.
+policy_timeshare_info - Specifies information associated with the system's Timeshare scheduling policy.
+

+
+
+ +

System Management Interface

+
+ +

Host Interface

+
+

+host_get_clock_service - Return a send right to a kernel clock's service port.
+host_get_time - Returns the current time as seen by that host.
+host_info - Return information about a host.
+host_kernel_version - Return kernel version information for a host.
+host_statistics - Return statistics for a host.
+mach_host_self - Returns send rights to the task's host self port.
+

+Data Structures +

+host_basic_info - Used to present basic information about a host.
+host_load_info - Used to present a host's processor load information.
+host_sched_info - Used to present the set of scheduler limits associated with the host.
+kernel_resource_sizes - Used to present the sizes of kernel's major structures.
+

+
+ +

Host Control Interface

+
+

+host_adjust_time - Arranges for the time on a specified host to be gradually changed by an adjustment value.
+host_default_memory_manager - Set the default memory manager.
+host_get_boot_info - Return operator boot information.
+host_get_clock_control - Return a send right to a kernel clock's control port.
+host_processor_slots - Return a list of numbers that map processor slots to active processors.
+host_processors - Return a list of send rights representing all processor ports.
+host_reboot - Reboot this host.
+host_set_time - Establishes the time on the specified host.
+

+
+ +

Host Security Interface

+
+

+host_security_create_task_token - Create a new task with an explicit security token.
+host_security_set_task_token - Change the target task's security token.
+

+
+ +

Resource Accounting Interface

+
+ +The Mach resource accounting mechanism is not functional in the current Mac OS X/Darwin system. It will become functional in a future release. + +

+ledger_create - Create a subordinate ledger.
+ledger_read - Return the ledger limit and balance.
+ledger_terminate - Destroy a ledger.
+ledger_transfer - Transfer resources from a parent ledger to a child.
+

+
+ +

Processor Management Interface

+
+

+processor_control - Perform caller-specified operation on target processor.
+processor_exit - Exit a processor.
+processor_info - Return information about a processor.
+processor_start - Start a processor.
+

+Processor Data Structures +

+processor_basic_info - Defines the basic information about a processor.
+

+
+ +

Processor Set Interface

+
+ +The processor set interface allows for the grouping of tasks and +processors for the purpose of exclusive scheduling. These interfaces +are deprecated and should not be used in code that isn't tied +to a particular release of Mac OS X/Darwin. These will likely change +or disappear in a future release. + +

+host_processor_sets - Return a list of send rights representing all processor set name ports.
+host_processor_set_priv - Translate a processor set name port into a processor set control port.
+processor_assign - Assign a processor to a processor set.
+processor_get_assignment - Get current assignment for a processor.
+processor_set_create - Create a new processor set.
+processor_set_default - Return the default processor set.
+processor_set_destroy - Destroy the target processor set.
+processor_set_info - Return processor set state according to caller-specified flavor.
+processor_set_max_priority - Sets the maximum scheduling priority for a processor set.
+processor_set_policy_control - Set target processor set's scheduling policy state.
+processor_set_policy_disable - Disables a scheduling policy for a processor set.
+processor_set_policy_enable - Enables a scheduling policy for a processor set.
+processor_set_statistics - Return scheduling statistics for a processor set.
+processor_set_tasks - Return all tasks currently assigned to the target processor set.
+processor_set_threads - Return all threads currently assigned to the target processor set.
+task_assign - Assign a task to a processor set.
+task_assign_default - Assign a task to the default processor set.
+task_get_assignment - Return the processor set to which a task is assigned.
+thread_assign - Assign a thread to a processor set.
+thread_assign_default - Assign a thread to the default processor set.
+thread_get_assignment - Return the processor set to which a thread is assigned.
+

+Processor Set Data Structures +

+processor_set_basic_info - Defines the basic information about a processor set.
+processor_set_load_info - Defines the scheduling statistics for a processor set.
+

+
+ +

Clock Interface

+
+

+clock_alarm - Set up an alarm.
+clock_get_attributes - Return attributes of a clock.
+clock_get_time - Return the current time.
+clock_map_time - Return a memory object that maps a clock.
+clock_set_attributes - Set a particular clock's attributes.
+clock_set_time - Set the current time.
+clock_sleep - Delay the invoking thread until a specified time.
+

+Clock Data Structures +

+mapped_tvalspec - Specifies the format the kernel uses to maintain a mapped clock's time.
+tvalspec - Defines format of system time values.
+

+Clock Interface Callbacks +

+clock_alarm_reply - Ring a preset alarm.
+

+Clock Callback Server Helpers +

+ clock_reply_server - Handle kernel-generated alarm.
+

+
+ +

Multi-Computer Support Interface

+
+ +These multi-computer support interfaces are no longer supported by +the Mac OS X/Darwin kernel. If and when multi-computer support is +added back in, something like these will likely be added. + +

+host_page_size - Returns the page size for the given host.
+ledger_get_remote - Return send right to specified host's remote ledger port.
+ledger_set_remote - Set this host's remote ledger port.
+

+
+ +
+ +

Machine Specific Interface

+
+ +

Intel 386 Support

+
+

+i386_get_ldt - Returns per-thread segment descriptors from the local descriptor table (LDT).
+i386_io_port_add - Adds a device to the I/O permission bitmap for a thread.
+i386_io_port_list - Returns a list of the devices named in the thread's I/O permission bitmap.
+i386_io_port_remove - Removes the specified device from the thread's I/O permission bitmap.
+i386_set_ldt - Allows a thread to have a private local descriptor table (LDT).
+

+
+ +

PowerPC Support

+
+

+

+
+ +
+ + + + + diff --git a/osfmk/tests/pmap_tests.c b/osfmk/tests/pmap_tests.c index 99624e77e..a12ca10c8 100644 --- a/osfmk/tests/pmap_tests.c +++ b/osfmk/tests/pmap_tests.c @@ -32,6 +32,9 @@ #include #if defined(__arm64__) #include +#if XNU_MONITOR +#include +#endif #endif extern ledger_template_t task_ledger_template; @@ -122,7 +125,152 @@ test_pmap_enter_disconnect(unsigned int num_loops) kern_return_t test_pmap_iommu_disconnect(void) { +#if XNU_MONITOR + kern_return_t kr = KERN_SUCCESS; + pmap_t new_pmap = pmap_create_wrapper(0); + + vm_page_t m = vm_page_grab(); + + vm_page_lock_queues(); + if (m != VM_PAGE_NULL) { + vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE); + } + vm_page_unlock_queues(); + + shart_ppl *iommu = NULL; + kr = pmap_iommu_init(shart_get_desc(), "sharttest0", NULL, 0, (ppl_iommu_state**)(&iommu)); + + if (kr != KERN_SUCCESS) { + goto cleanup; + } + + if ((new_pmap == NULL) || (m == VM_PAGE_NULL) || (iommu == NULL)) { + kr = KERN_FAILURE; + goto cleanup; + } + + ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m); + + const ppl_iommu_seg shart_segs[] = { + {.iova = 0, + .paddr = ptoa(phys_page), + .nbytes = PAGE_SIZE, + .prot = VM_PROT_READ, + .refcon = 0}, + + {.iova = 1, + .paddr = ptoa(phys_page), + .nbytes = PAGE_SIZE, + .prot = VM_PROT_READ | VM_PROT_WRITE, + .refcon = 0}, + + {.iova = 2, + .paddr = ptoa(phys_page), + .nbytes = PAGE_SIZE, + .prot = VM_PROT_READ, + .refcon = 0}, + + {.iova = 3, + .paddr = ptoa(phys_page), + .nbytes = PAGE_SIZE, + .prot = VM_PROT_READ, + .refcon = 0} + }; + + /* Phase 1: one CPU mapping */ + kr = pmap_enter(new_pmap, PMAP_TEST_VA, phys_page, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE); + assert(kr == KERN_SUCCESS); + assert(!pmap_verify_free(phys_page)); + pmap_disconnect(phys_page); + assert(pmap_verify_free(phys_page)); + + /* Phase 2: two CPU mappings */ + kr = pmap_enter(new_pmap, PMAP_TEST_VA, phys_page, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE); + assert(kr == KERN_SUCCESS); + kr = 
pmap_enter(new_pmap, PMAP_TEST_VA + PAGE_SIZE, phys_page, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE); + assert(kr == KERN_SUCCESS); + assert(!pmap_verify_free(phys_page)); + pmap_disconnect(phys_page); + assert(pmap_verify_free(phys_page)); + + /* Phase 3: one IOMMU mapping */ + kr = pmap_iommu_map(&iommu->super, shart_segs, 1, 0, NULL); + assert(kr == KERN_SUCCESS); + assert(!pmap_verify_free(phys_page)); + pmap_disconnect(phys_page); + assert(!pmap_verify_free(phys_page)); + pmap_iommu_unmap(&iommu->super, shart_segs, 1, 0, NULL); + assert(pmap_verify_free(phys_page)); + + /* Phase 4: two IOMMU mappings */ + kr = pmap_iommu_map(&iommu->super, shart_segs, 2, 0, NULL); + assert(kr == KERN_SUCCESS); + assert(!pmap_verify_free(phys_page)); + pmap_disconnect(phys_page); + assert(!pmap_verify_free(phys_page)); + pmap_iommu_unmap(&iommu->super, &shart_segs[1], 1, 0, NULL); + assert(!pmap_verify_free(phys_page)); + pmap_disconnect(phys_page); + assert(!pmap_verify_free(phys_page)); + pmap_iommu_unmap(&iommu->super, shart_segs, 1, 0, NULL); + assert(pmap_verify_free(phys_page)); + + /* Phase 5: combined CPU and IOMMU mappings */ + kr = pmap_iommu_map(&iommu->super, shart_segs, 1, 0, NULL); + assert(kr == KERN_SUCCESS); + kr = pmap_enter(new_pmap, PMAP_TEST_VA, phys_page, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE); + assert(kr == KERN_SUCCESS); + kr = pmap_iommu_map(&iommu->super, &shart_segs[1], 2, 0, NULL); + assert(kr == KERN_SUCCESS); + kr = pmap_enter(new_pmap, PMAP_TEST_VA + PAGE_SIZE, phys_page, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE); + assert(kr == KERN_SUCCESS); + kr = pmap_iommu_map(&iommu->super, &shart_segs[3], 1, 0, NULL); + assert(kr == KERN_SUCCESS); + assert(!pmap_verify_free(phys_page)); + pmap_disconnect(phys_page); + assert(!pmap_verify_free(phys_page)); + pmap_iommu_unmap(&iommu->super, shart_segs, 4, 0, NULL); + assert(pmap_verify_free(phys_page)); + + /* Phase 6: differently 
combined CPU and IOMMU mappings */ + kr = pmap_enter(new_pmap, PMAP_TEST_VA, phys_page, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE); + assert(kr == KERN_SUCCESS); + kr = pmap_iommu_map(&iommu->super, &shart_segs[1], 3, 0, NULL); + assert(kr == KERN_SUCCESS); + kr = pmap_enter(new_pmap, PMAP_TEST_VA + PAGE_SIZE, phys_page, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE); + assert(kr == KERN_SUCCESS); + kr = pmap_iommu_map(&iommu->super, shart_segs, 1, 0, NULL); + assert(kr == KERN_SUCCESS); + kr = pmap_enter(new_pmap, PMAP_TEST_VA + (2 * PAGE_SIZE), phys_page, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE); + assert(kr == KERN_SUCCESS); + assert(!pmap_verify_free(phys_page)); + pmap_iommu_unmap(&iommu->super, &shart_segs[2], 1, 0, NULL); + assert(!pmap_verify_free(phys_page)); + pmap_disconnect(phys_page); + assert(!pmap_verify_free(phys_page)); + pmap_iommu_unmap(&iommu->super, shart_segs, 4, 0, NULL); + assert(pmap_verify_free(phys_page)); + pmap_disconnect(phys_page); + assert(pmap_verify_free(phys_page)); + +cleanup: + + if (iommu != NULL) { + pmap_iommu_ioctl(&iommu->super, SHART_IOCTL_TEARDOWN, NULL, 0, NULL, 0); + } + vm_page_lock_queues(); + if (m != VM_PAGE_NULL) { + vm_page_free(m); + } + vm_page_unlock_queues(); + if (new_pmap != NULL) { + pmap_destroy(new_pmap); + } + + return kr; +#else return KERN_SUCCESS; +#endif } kern_return_t diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index 305c8d677..6146c8e40 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -15951,6 +15951,13 @@ vm_map_remap_extract( if (!copy) { if (src_entry->used_for_jit == TRUE) { if (same_map) { +#if __APRR_SUPPORTED__ + /* + * Disallow re-mapping of any JIT regions on APRR devices. 
+ */ + result = KERN_PROTECTION_FAILURE; + break; +#endif /* __APRR_SUPPORTED__*/ } else { #if CONFIG_EMBEDDED /* diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c index 21b7d3951..33344f15e 100644 --- a/osfmk/vm/vm_pageout.c +++ b/osfmk/vm/vm_pageout.c @@ -347,6 +347,11 @@ uint32_t vm_pageout_memorystatus_fb_factor_dr = 2; #endif +#if __AMP__ +int vm_compressor_ebound = 1; +int vm_pgo_pbound = 0; +extern void thread_bind_cluster_type(char); +#endif /* __AMP__ */ /* @@ -3932,7 +3937,16 @@ vm_pageout_iothread_internal_continue(struct cq *cq) KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0); q = cq->q; +#if __AMP__ + if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) { + local_batch_size = (q->pgo_maxlaundry >> 3); + local_batch_size = MAX(local_batch_size, 16); + } else { + local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2); + } +#else local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2); +#endif #if RECORD_THE_COMPRESSED_DATA if (q->pgo_laundry) { @@ -4317,6 +4331,11 @@ vm_pageout_iothread_internal(struct cq *cq) } +#if __AMP__ + if (vm_compressor_ebound) { + thread_bind_cluster_type('E'); + } +#endif /* __AMP__ */ thread_set_thread_name(current_thread(), "VM_compressor"); #if DEVELOPMENT || DEBUG @@ -4723,6 +4742,12 @@ vm_pageout(void) +#if __AMP__ + PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound)); + if (vm_pgo_pbound) { + thread_bind_cluster_type('P'); + } +#endif /* __AMP__ */ splx(s); @@ -4996,6 +5021,12 @@ vm_pageout_internal_start(void) PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count, sizeof(vm_pageout_state.vm_compressor_thread_count)); +#if __AMP__ + PE_parse_boot_argn("vmcomp_ecluster", &vm_compressor_ebound, sizeof(vm_compressor_ebound)); + if (vm_compressor_ebound) { + vm_pageout_state.vm_compressor_thread_count = 2; + } +#endif if (vm_pageout_state.vm_compressor_thread_count >= 
hinfo.max_cpus) { vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1; } diff --git a/pexpert/arm/pe_identify_machine.c b/pexpert/arm/pe_identify_machine.c index 34ec23be7..b35a029e9 100644 --- a/pexpert/arm/pe_identify_machine.c +++ b/pexpert/arm/pe_identify_machine.c @@ -309,10 +309,25 @@ static struct tbd_ops t8011_funcs = {NULL, NULL, NULL}; static struct tbd_ops t8015_funcs = {NULL, NULL, NULL}; #endif /* defined(ARM_BOARD_CLASS_T8015) */ +#if defined(ARM_BOARD_CLASS_T8020) +static struct tbd_ops t8020_funcs = {NULL, NULL, NULL}; +#endif /* defined(ARM_BOARD_CLASS_T8020) */ +#if defined(ARM_BOARD_CLASS_T8006) +static struct tbd_ops t8006_funcs = {NULL, NULL, NULL}; +#endif /* defined(ARM_BOARD_CLASS_T8006) */ +#if defined(ARM_BOARD_CLASS_T8027) +static struct tbd_ops t8027_funcs = {NULL, NULL, NULL}; +#endif /* defined(ARM_BOARD_CLASS_T8027) */ +#if defined(ARM_BOARD_CLASS_T8028) +static struct tbd_ops t8028_funcs = {NULL, NULL, NULL}; +#endif /* defined(ARM_BOARD_CLASS_T8028) */ +#if defined(ARM_BOARD_CLASS_T8030) +static struct tbd_ops t8030_funcs = {NULL, NULL, NULL}; +#endif /* defined(ARM_BOARD_CLASS_T8030) */ @@ -733,6 +748,31 @@ pe_arm_init_timer(void *args) tbd_funcs = &t8015_funcs; } else #endif +#if defined(ARM_BOARD_CLASS_T8020) + if (!strcmp(gPESoCDeviceType, "t8020-io")) { + tbd_funcs = &t8020_funcs; + } else +#endif +#if defined(ARM_BOARD_CLASS_T8006) + if (!strcmp(gPESoCDeviceType, "t8006-io")) { + tbd_funcs = &t8006_funcs; + } else +#endif +#if defined(ARM_BOARD_CLASS_T8027) + if (!strcmp(gPESoCDeviceType, "t8027-io")) { + tbd_funcs = &t8027_funcs; + } else +#endif +#if defined(ARM_BOARD_CLASS_T8028) + if (!strcmp(gPESoCDeviceType, "t8028-io")) { + tbd_funcs = &t8028_funcs; + } else +#endif +#if defined(ARM_BOARD_CLASS_T8030) + if (!strcmp(gPESoCDeviceType, "t8030-io")) { + tbd_funcs = &t8030_funcs; + } else +#endif #if defined(ARM_BOARD_CLASS_BCM2837) if (!strcmp(gPESoCDeviceType, "bcm2837-io")) { tbd_funcs = &bcm2837_funcs; diff 
--git a/pexpert/pexpert/arm64/arm64_common.h b/pexpert/pexpert/arm64/arm64_common.h index 3d32aca8b..fe0b98768 100644 --- a/pexpert/pexpert/arm64/arm64_common.h +++ b/pexpert/pexpert/arm64/arm64_common.h @@ -192,12 +192,79 @@ #endif /* defined (HAS_KTRR) */ +#if defined(HAS_CTRR) +#ifdef ASSEMBLER +#define ARM64_REG_CTRR_A_LWR_EL1 S3_4_c15_c2_3 +#define ARM64_REG_CTRR_A_UPR_EL1 S3_4_c15_c2_4 +#define ARM64_REG_CTRR_CTL_EL1 S3_4_c15_c2_5 +#define ARM64_REG_CTRR_LOCK_EL1 S3_4_c15_c2_2 + +#define ACC_CTRR_A_LWR_EL2 S3_4_c15_c11_0 +#define ACC_CTRR_A_UPR_EL2 S3_4_c15_c11_1 +#define ACC_CTRR_CTL_EL2 S3_4_c15_c11_4 +#define ACC_CTRR_LOCK_EL2 S3_4_c15_c11_5 +#else /* ASSEMBLER */ +#define ARM64_REG_CTRR_A_LWR_EL1 "S3_4_c15_c2_3" +#define ARM64_REG_CTRR_A_UPR_EL1 "S3_4_c15_c2_4" +#define ARM64_REG_CTRR_CTL_EL1 "S3_4_c15_c2_5" +#define ARM64_REG_CTRR_LOCK_EL1 "S3_4_c15_c2_2" + +#define ACC_CTRR_A_LWR_EL2 "S3_4_c15_c11_0" +#define ACC_CTRR_A_UPR_EL2 "S3_4_c15_c11_1" +#define ACC_CTRR_CTL_EL2 "S3_4_c15_c11_4" +#define ACC_CTRR_LOCK_EL2 "S3_4_c15_c11_5" +#endif /* ASSEMBLER */ + +#define CTRR_CTL_EL1_A_MMUOFF_WRPROTECT (1 << 0) +#define CTRR_CTL_EL1_A_MMUON_WRPROTECT (1 << 1) +#define CTRR_CTL_EL1_B_MMUOFF_WRPROTECT (1 << 2) +#define CTRR_CTL_EL1_B_MMUON_WRPROTECT (1 << 3) +#define CTRR_CTL_EL1_A_PXN (1 << 4) +#define CTRR_CTL_EL1_B_PXN (1 << 5) +#define CTRR_CTL_EL1_A_UXN (1 << 6) +#define CTRR_CTL_EL1_B_UXN (1 << 7) + +#endif /* defined (HAS_CTRR) */ + +#if defined(HAS_IPI) + +#define ARM64_REG_IPI_RR_TYPE_IMMEDIATE (0 << 28) +#define ARM64_REG_IPI_RR_TYPE_RETRACT (1 << 28) +#define ARM64_REG_IPI_RR_TYPE_DEFERRED (2 << 28) +#define ARM64_REG_IPI_RR_TYPE_NOWAKE (3 << 28) + +#if defined(HAS_CLUSTER) +#define ARM64_REG_IPI_RR_LOCAL "S3_5_c15_c0_0" +#define ARM64_REG_IPI_RR_GLOBAL "S3_5_c15_c0_1" +#else /* defined(HAS_CLUSTER) */ +#define ARM64_REG_IPI_RR "S3_5_c15_c0_1" +#endif /* defined(HAS_CLUSTER) */ + +#define ARM64_REG_IPI_SR "S3_5_c15_c1_1" +#define ARM64_REG_IPI_CR 
"S3_5_c15_c3_1" + +#endif /* defined(HAS_IPI) */ #endif /* APPLE_ARM64_ARCH_FAMILY */ +#if defined(HAS_NEX_PG) +#define ARM64_REG_HID13 S3_0_c15_c14_0 +#define ARM64_REG_HID13_RstCyc_mask (0xfULL << 60) +#define ARM64_REG_HID13_RstCyc_val (0xcULL << 60) + +#define ARM64_REG_HID14 S3_0_c15_c15_0 +#define ARM64_REG_HID14_NexPwgEn (1ULL << 32) +#endif /* defined(HAS_NEX_PG) */ +#if defined(HAS_BP_RET) +#define ARM64_REG_ACC_CFG S3_5_c15_c4_0 +#define ARM64_REG_ACC_CFG_bdpSlpEn (1ULL << 2) +#define ARM64_REG_ACC_CFG_btpSlpEn (1ULL << 3) +#define ARM64_REG_ACC_CFG_bpSlp_mask 3 +#define ARM64_REG_ACC_CFG_bpSlp_shift 2 +#endif /* defined(HAS_BP_RET) */ #if defined(HAS_APPLE_PAC) @@ -246,6 +313,18 @@ #endif /* ASSEMBLER */ #endif /* HAS_APPLE_PAC */ +#if defined(HAS_VMSA_LOCK) + +#define ARM64_REG_VMSA_LOCK_EL1 S3_4_c15_c1_2 + +#define VMSA_LOCK_VBAR_EL1 (1ULL << 0) +#define VMSA_LOCK_SCTLR_EL1 (1ULL << 1) +#define VMSA_LOCK_TCR_EL1 (1ULL << 2) +#define VMSA_LOCK_TTBR0_EL1 (1ULL << 3) +#define VMSA_LOCK_TTBR1_EL1 (1ULL << 4) +#define VMSA_LOCK_SCTLR_M_BIT (1ULL << 63) + +#endif /* HAS_VMSA_LOCK */ diff --git a/pexpert/pexpert/arm64/board_config.h b/pexpert/pexpert/arm64/board_config.h index bad756857..90851847f 100644 --- a/pexpert/pexpert/arm64/board_config.h +++ b/pexpert/pexpert/arm64/board_config.h @@ -146,10 +146,125 @@ #endif #endif /* ARM64_BOARD_CONFIG_T8015 */ +#ifdef ARM64_BOARD_CONFIG_T8020 +/* + * The LLC size for Vortex is 8MB, but the LLC on Tempest is only 2MB. + * We use the larger cache size here. The expectation is + * that this may cause flushes from Tempest to be less efficient + * (cycles will be wasted on unnecessary way/set operations), but it + * will be technically correct... the best kind of correct. 
+ */ +#define APPLE_ARM64_ARCH_FAMILY 1 +#define APPLEVORTEX +#define ARM_ARCH_TIMER +#define KERNEL_INTEGRITY_CTRR +#include +#define __ARM_L2CACHE_SIZE_LOG__ 23 +#define ARM_BOARD_WFE_TIMEOUT_NS 1000 +#define ARM_BOARD_CLASS_T8020 +#define CPU_COUNT 6 +#define CPU_CLUSTER_OFFSETS {0, 4} +#define HAS_UNCORE_CTRS 1 +#define UNCORE_VERSION 2 +#define UNCORE_PER_CLUSTER 1 +#define UNCORE_NCTRS 16 +#define CORE_NCTRS 10 +#define PMAP_PV_LOAD_FACTOR 5 +#define PMAP_CS 1 +#define PMAP_CS_ENABLE 1 +#endif /* ARM64_BOARD_CONFIG_T8020 */ +#ifdef ARM64_BOARD_CONFIG_T8006 +/* + * The T8006 consists of 2 Tempest cores (i.e. T8020 eCores) and for most + * of our purposes here may be considered a functional subset of T8020. + */ +#define APPLE_ARM64_ARCH_FAMILY 1 +#define APPLEVORTEX +#define ARM_ARCH_TIMER +#define KERNEL_INTEGRITY_CTRR +#include +#define __ARM_L2CACHE_SIZE_LOG__ 21 +#define ARM_BOARD_WFE_TIMEOUT_NS 1000 +#define ARM_BOARD_CLASS_T8006 +#define PEXPERT_NO_3X_IMAGES 1 +#define CORE_NCTRS 10 +#define PMAP_PV_LOAD_FACTOR 5 +#define PMAP_CS 1 +#define PMAP_CS_ENABLE 1 +#endif /* ARM64_BOARD_CONFIG_T8006 */ +#ifdef ARM64_BOARD_CONFIG_T8027 +#define APPLE_ARM64_ARCH_FAMILY 1 +#define APPLEVORTEX +#define ARM_ARCH_TIMER +#define KERNEL_INTEGRITY_CTRR +#include +#define __ARM_L2CACHE_SIZE_LOG__ 23 +#define ARM_BOARD_WFE_TIMEOUT_NS 1000 +#define ARM_BOARD_CLASS_T8027 +#define CPU_COUNT 8 +#define CPU_CLUSTER_OFFSETS {0, 4} +#define HAS_UNCORE_CTRS 1 +#define UNCORE_VERSION 2 +#define UNCORE_PER_CLUSTER 1 +#define UNCORE_NCTRS 16 +#define CORE_NCTRS 10 +#define PMAP_PV_LOAD_FACTOR 5 +#define PMAP_CS 1 +#define PMAP_CS_ENABLE 1 +#endif /* ARM64_BOARD_CONFIG_T8027 */ +#ifdef ARM64_BOARD_CONFIG_T8028 +#define APPLE_ARM64_ARCH_FAMILY 1 +#define APPLEVORTEX +#define ARM_ARCH_TIMER +#define KERNEL_INTEGRITY_CTRR +#include +#define __ARM_L2CACHE_SIZE_LOG__ 23 +#define ARM_BOARD_WFE_TIMEOUT_NS 1000 +#define ARM_BOARD_CLASS_T8028 +#define CPU_COUNT 8 +#define CPU_CLUSTER_OFFSETS 
{0, 4} +#define HAS_UNCORE_CTRS 1 +#define UNCORE_VERSION 2 +#define UNCORE_PER_CLUSTER 1 +#define UNCORE_NCTRS 16 +#define CORE_NCTRS 10 +#define PMAP_PV_LOAD_FACTOR 5 +#define PMAP_CS 1 +#define PMAP_CS_ENABLE 1 +#endif /* ARM64_BOARD_CONFIG_T8028 */ +#ifdef ARM64_BOARD_CONFIG_T8030 +/* + * The LLC size for Lightning is 8MB, but the LLC on Thunder is only 4MB. + * We use the larger cache size here. The expectation is + * that this may cause flushes from Tempest to be less efficient + * (cycles will be wasted on unnecessary way/set operations), but it + * will be technically correct... the best kind of correct. + */ +#define APPLE_ARM64_ARCH_FAMILY 1 +#define APPLELIGHTNING +#define ARM_ARCH_TIMER +#define KERNEL_INTEGRITY_CTRR +#include +#define __ARM_L2CACHE_SIZE_LOG__ 23 +#define ARM_BOARD_WFE_TIMEOUT_NS 1000 +#define ARM_BOARD_CLASS_T8030 +#define CPU_COUNT 6 +#define CPU_CLUSTER_OFFSETS {0, 4} +#define CPU_PIO_RO_CTL_OFFSETS {0x210055000, 0x210155000, 0x210255000, 0x210355000, 0x211055000, 0x211155000} +#define CLUSTER_PIO_RO_CTL_OFFSETS {0x210e49000, 0x211e49000} +#define HAS_UNCORE_CTRS 1 +#define UNCORE_VERSION 2 +#define UNCORE_PER_CLUSTER 1 +#define UNCORE_NCTRS 16 +#define CORE_NCTRS 10 +#define PMAP_PV_LOAD_FACTOR 7 +#define PMAP_CS 1 +#define PMAP_CS_ENABLE 1 +#endif /* ARM64_BOARD_CONFIG_T8030 */ diff --git a/san/kasan-arm64.c b/san/kasan-arm64.c index 909a075ef..7fa3a8e56 100644 --- a/san/kasan-arm64.c +++ b/san/kasan-arm64.c @@ -265,7 +265,7 @@ kasan_arch_init(void) /* Map the physical aperture */ kasan_map_shadow(kernel_vtop, physmap_vtop - kernel_vtop, true); -#if defined(KERNEL_INTEGRITY_KTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) /* Pre-allocate all the L3 page table pages to avoid triggering KTRR */ kasan_map_shadow_internal(VM_MIN_KERNEL_ADDRESS, VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS + 1, false, false); #endif diff --git a/security/mac_framework.h b/security/mac_framework.h index 3f9b67198..e8c27a348 
100644 --- a/security/mac_framework.h +++ b/security/mac_framework.h @@ -339,6 +339,7 @@ void mac_posixshm_label_init(struct pshminfo *pshm); int mac_priv_check(kauth_cred_t cred, int priv); int mac_priv_grant(kauth_cred_t cred, int priv); int mac_proc_check_debug(proc_t proc1, proc_t proc2); +int mac_proc_check_dump_core(proc_t proc); int mac_proc_check_proc_info(proc_t curp, proc_t target, int callnum, int flavor); int mac_proc_check_get_cs_info(proc_t curp, proc_t target, unsigned int op); int mac_proc_check_set_cs_info(proc_t curp, proc_t target, unsigned int op); diff --git a/security/mac_policy.h b/security/mac_policy.h index 9baaa2df9..1b46adf7a 100644 --- a/security/mac_policy.h +++ b/security/mac_policy.h @@ -2751,6 +2751,19 @@ typedef int mpo_priv_grant_t( kauth_cred_t cred, int priv ); +/** + * @brief Access control over process core dumps + * @param proc Subject process + * + * Determine whether a core dump may be written to disk for the subject + * identified. + * + * @return Return 0 if access is granted, otherwise an appropriate value for + * errno should be returned. + */ +typedef int mpo_proc_check_dump_core_t( + struct proc *proc + ); /** * @brief Access control check for debugging process * @param cred Subject credential @@ -6283,7 +6296,7 @@ typedef void mpo_reserved_hook_t(void); * Please note that this should be kept in sync with the check assumptions * policy in bsd/kern/policy_check.c (policy_ops struct). 
*/ -#define MAC_POLICY_OPS_VERSION 58 /* inc when new reserved slots are taken */ +#define MAC_POLICY_OPS_VERSION 59 /* inc when new reserved slots are taken */ struct mac_policy_ops { mpo_audit_check_postselect_t *mpo_audit_check_postselect; mpo_audit_check_preselect_t *mpo_audit_check_preselect; @@ -6474,8 +6487,8 @@ struct mac_policy_ops { mpo_proc_check_setlcid_t *mpo_proc_check_setlcid; mpo_proc_check_signal_t *mpo_proc_check_signal; mpo_proc_check_wait_t *mpo_proc_check_wait; + mpo_proc_check_dump_core_t *mpo_proc_check_dump_core; mpo_reserved_hook_t *mpo_reserved5; - mpo_reserved_hook_t *mpo_reserved6; mpo_socket_check_accept_t *mpo_socket_check_accept; mpo_socket_check_accepted_t *mpo_socket_check_accepted; diff --git a/security/mac_process.c b/security/mac_process.c index 603b7499c..31d539af2 100644 --- a/security/mac_process.c +++ b/security/mac_process.c @@ -327,6 +327,26 @@ mac_proc_check_debug(proc_t curp, struct proc *proc) return error; } +int +mac_proc_check_dump_core(struct proc *proc) +{ + int error; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_proc_enforce) { + return 0; + } +#endif + if (!mac_proc_check_enforce(proc)) { + return 0; + } + + MAC_CHECK(proc_check_dump_core, proc); + + return error; +} + int mac_proc_check_fork(proc_t curp) { diff --git a/tests/Makefile b/tests/Makefile index c559c84d1..610cecb15 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -38,6 +38,9 @@ CODESIGN_ALLOCATE:=$(shell xcrun -sdk "$(TARGETSDK)" -find codesign_allocate) atm_diagnostic_flag: OTHER_CFLAGS += drop_priv.c +atm_diagnostic_flag_entitled: CODE_SIGN_ENTITLEMENTS = atm_diagnostic_flag.entitlements +atm_diagnostic_flag_entitled: OTHER_CFLAGS += drop_priv.c + testposixshm: INVALID_ARCHS = i386 avx: INVALID_ARCHS = i386 diff --git a/tests/atm_diagnostic_flag.c b/tests/atm_diagnostic_flag.c index 021930149..88d314e7a 100644 --- a/tests/atm_diagnostic_flag.c +++ b/tests/atm_diagnostic_flag.c @@ -52,9 +52,8 @@ 
_reset_atm_diagnostic_flag(void) } } -T_DECL(toggle_atm_diagnostic_flag, - "change the atm_diagnostic_flag, which should use the commpage", - T_META_ASROOT(true)) +static void +_toggle_atm_diagnostic_flag(void) { T_ATEND(_reset_atm_diagnostic_flag); uint32_t f = _save_atm_diagnostic_flag(); @@ -65,18 +64,21 @@ T_DECL(toggle_atm_diagnostic_flag, "Ignoring host_set_atm_diagnostic_flag functionality. " "Bailing gracefully."); } - T_EXPECT_MACH_SUCCESS(kr, "Set atm_diagnostic_flag"); + T_EXPECT_MACH_ERROR(KERN_NO_ACCESS, kr, + "Deny change to atm_diagnostic_flag"); +} + +T_DECL(atm_diagnostic_flag_unentitled_privileged, + "expect to fail to set the atm_diagnostic_flag (unentitled, privileged)", + T_META_ASROOT(true)) +{ + _toggle_atm_diagnostic_flag(); } -T_DECL(unprivileged_atm_diagnostic_flag, - "expect to fail to set the atm_diagnostic_flag", +T_DECL(atm_diagnostic_flag_unentitled_unprivileged, + "expect to fail to set the atm_diagnostic_flag (unentitled, unprivileged)", T_META_ASROOT(false)) { drop_priv(); - T_ATEND(_reset_atm_diagnostic_flag); - uint32_t f = _save_atm_diagnostic_flag(); - f ^= LIBTRACE_PRIVATE_DATA; - kern_return_t kr = _mutate_atm_diagnostic_flag(f); - T_EXPECT_MACH_ERROR(KERN_INVALID_ARGUMENT, kr, - "Deny change to atm_diagnostic_flag"); + _toggle_atm_diagnostic_flag(); } diff --git a/tests/atm_diagnostic_flag.entitlements b/tests/atm_diagnostic_flag.entitlements new file mode 100644 index 000000000..491a479c2 --- /dev/null +++ b/tests/atm_diagnostic_flag.entitlements @@ -0,0 +1,8 @@ + + + + + com.apple.private.set-atm-diagnostic-flag + + + diff --git a/tests/atm_diagnostic_flag_entitled.c b/tests/atm_diagnostic_flag_entitled.c new file mode 100644 index 000000000..30235c37b --- /dev/null +++ b/tests/atm_diagnostic_flag_entitled.c @@ -0,0 +1,83 @@ +#include + +#include +#include + +T_GLOBAL_META(T_META_NAMESPACE("xnu.debugging")); + +/* + * The low 8 bits may be in use, so modify one + * of the upper 8 bits to ensure round-tripping. 
+ */
+#define LIBTRACE_PRIVATE_DATA 0x01000000
+
+extern void drop_priv(void);
+
+static bool _needs_reset;	/* set once a host_set_atm_diagnostic_flag() call made by the test succeeds */
+static uint32_t _original;	/* flag value captured by _save_atm_diagnostic_flag() */
+
+static uint32_t
+_save_atm_diagnostic_flag(void)
+{
+	kern_return_t kr;
+	kr = host_get_atm_diagnostic_flag(mach_host_self(), &_original);
+	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_get_atm_diagnostic_flag()");
+	T_LOG("Original ATM diagnostic flag: 0x%08x", _original);
+	return _original;
+}
+
+static kern_return_t
+_mutate_atm_diagnostic_flag(uint32_t v)
+{
+	T_LOG("Try to set ATM diagnostic flag to: 0x%08x", v);
+	kern_return_t kr = host_set_atm_diagnostic_flag(mach_host_self(), v);
+	if (kr == KERN_SUCCESS) {
+		_needs_reset = true;	/* lets the T_ATEND handler know there is something to undo */
+	}
+	return kr;
+}
+
+static void
+_reset_atm_diagnostic_flag(void)
+{
+	if (!_needs_reset) {
+		return;
+	}
+	T_LOG("Reset ATM diagnostic flag to: 0x%08x", _original);
+	kern_return_t kr;
+	kr = host_set_atm_diagnostic_flag(mach_host_self(), _original);
+	if (kr != KERN_SUCCESS) {
+		T_ASSERT_FAIL("host_set_atm_diagnostic_flag() failed: %s",
+		    mach_error_string(kr));
+	}
+}
+
+static void
+_toggle_atm_diagnostic_flag(void)
+{
+	T_ATEND(_reset_atm_diagnostic_flag);
+	uint32_t f = _save_atm_diagnostic_flag();
+	f ^= LIBTRACE_PRIVATE_DATA;	/* flip only the bit reserved for this test */
+	kern_return_t kr = _mutate_atm_diagnostic_flag(f);
+	if (kr == KERN_NOT_SUPPORTED) {
+		T_SKIP("Seems ATM is disabled on this platform. "
+		    "Ignoring host_set_atm_diagnostic_flag functionality. "
+		    "Bailing gracefully.");
+	}
+	T_EXPECT_MACH_SUCCESS(kr, "Set atm_diagnostic_flag");
+}
+
+T_DECL(atm_diagnostic_flag_entitled_privileged,
+    "change the atm_diagnostic_flag (entitled, privileged)",
+    T_META_ASROOT(true))
+{
+	_toggle_atm_diagnostic_flag();
+}
+
+T_DECL(atm_diagnostic_flag_entitled_unprivileged,
+    "change the atm_diagnostic_flag (entitled, unprivileged)",
+    T_META_ASROOT(false))
+{
+	drop_priv();
+	_toggle_atm_diagnostic_flag();
+}
diff --git a/tests/monotonic_uncore.c b/tests/monotonic_uncore.c
new file mode 100644
index 000000000..0274bbc62
--- /dev/null
+++ b/tests/monotonic_uncore.c
@@ -0,0 +1,460 @@
+/*
+ * Must come before including darwintest.h
+ */
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif /* defined(T_NAMESPACE) */
+
+/*
+ * NOTE(review): the header names below are reconstructed from the symbols
+ * this file uses -- confirm them against the original commit.
+ */
+#include <darwintest.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#ifndef PRIVATE
+/*
+ * Need new CPU families.
+ */
+#define PRIVATE
+#include <mach/machine.h>
+#undef PRIVATE
+#else /* !defined(PRIVATE) */
+#include <mach/machine.h>
+#endif /* defined(PRIVATE) */
+#include <stdint.h>
+#include <System/sys/guarded.h>
+#include <System/sys/monotonic.h>
+#include <sys/ioctl.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
+
+T_GLOBAL_META(
+	T_META_NAMESPACE("xnu.monotonic"),
+	T_META_CHECK_LEAKS(false),
+	T_META_ENABLED(false)
+	);
+
+/*
+ * Query the hw.cputype, hw.cpusubtype and hw.cpufamily sysctls and return
+ * true only for an ARM64 V8 CPU in the Monsoon/Mistral or Vortex/Tempest family.
+ */
+static bool
+device_supports_uncore(void)
+{
+	int r;
+	int type, subtype;
+	unsigned int family;
+	size_t size = sizeof(type);
+
+	/*
+	 * Only arm64 Monsoon/Mistral and Vortex/Tempest devices support uncore counters.
+	 */
+
+	r = sysctlbyname("hw.cputype", &type, &size, NULL, 0);
+	T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "sysctlbyname(\"hw.cputype\")");
+	r = sysctlbyname("hw.cpusubtype", &subtype, &size, NULL, 0);
+	T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "sysctlbyname(\"hw.cpusubtype\")");
+	r = sysctlbyname("hw.cpufamily", &family, &size, NULL, 0);
+	T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "sysctlbyname(\"hw.cpufamily\")");
+
+	if (type == CPU_TYPE_ARM64 &&
+	    subtype == CPU_SUBTYPE_ARM64_V8 &&
+	    (family == CPUFAMILY_ARM_MONSOON_MISTRAL ||
+	    family == CPUFAMILY_ARM_VORTEX_TEMPEST)) {
+		return true;
+	}
+
+	return false;
+}
+
+#define UNCORE_DEV_PATH "/dev/monotonic/uncore"
+
+/*
+ * Open UNCORE_DEV_PATH with guarded_open_np() and return the descriptor.
+ *
+ * A missing node (ENOENT) skips the test.  When `error` is NULL any other
+ * failure is a test assertion failure; otherwise errno is copied to `*error`
+ * and the result of the open, which may be negative, is returned.
+ */
+static int
+open_uncore_error(int *error)
+{
+	guardid_t guard;
+	int fd;
+
+	guard = 0xa5adcafe;
+
+	T_SETUPBEGIN;
+
+	fd = guarded_open_np(UNCORE_DEV_PATH, &guard,
+	    GUARD_CLOSE | GUARD_DUP | GUARD_WRITE, O_CLOEXEC | O_EXCL);
+	if (fd < 0 && errno == ENOENT) {
+		T_ASSERT_FALSE(device_supports_uncore(),
+		    "lack of dev node implies no uncore support");
+		T_SKIP("uncore counters are unsupported");
+		__builtin_unreachable();
+	}
+
+	if (error == NULL) {
+		T_ASSERT_POSIX_SUCCESS(fd, "open '%s'", UNCORE_DEV_PATH);
+	} else {
+		*error = errno;
+	}
+
+	T_SETUPEND;
+
+	return fd;
+}
+
+/*
+ * Issue MT_IOC_COUNTS for the counters in `ctr_mask`.  `counts` is passed to
+ * the ioctl as a union monotonic_ctl_counts, so the mask is written into the
+ * buffer first and callers read the counter values back out of it afterwards.
+ */
+static void
+uncore_counts(int fd, uint64_t ctr_mask, uint64_t *counts)
+{
+	int r;
+	union monotonic_ctl_counts *cts_ctl;
+
+	cts_ctl = (union monotonic_ctl_counts *)counts;
+	cts_ctl->in.ctr_mask = ctr_mask;
+
+	r = ioctl(fd, MT_IOC_COUNTS, cts_ctl);
+	T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "MT_IOC_COUNTS got counter values");
+}
+
+#define REF_TIMEBASE_EVENT 0x3
+#define CTRS_MAX 32
+
+T_DECL(uncore_max_counters,
+    "ensure that the maximum number of uncore counters is sane",
+    T_META_ASROOT(true))
+{
+	int nctrs = 0;
+	int fd;
+
+	fd = open_uncore_error(NULL);
+
+	do {
+		union monotonic_ctl_add add_ctl;
+		int r;
+
+		add_ctl.in.config.event = REF_TIMEBASE_EVENT;
+		add_ctl.in.config.allowed_ctr_mask = UINT64_MAX;
+
+		r = ioctl(fd, MT_IOC_ADD, &add_ctl);
+		if (r < 0 && errno == E2BIG) {
+			break;
+		}
+
+		T_QUIET;
+		T_ASSERT_POSIX_SUCCESS(r, "added reference timebase event to counters");
+		nctrs++;
+	} while (nctrs < CTRS_MAX);
+
+	T_EXPECT_LT(nctrs, CTRS_MAX,
+	    "only able to allocate a reasonable number of counters");
+}
+
+/*
+ * Add `event` with MT_IOC_ADD, using `allowed_ctrs` as the allowed counter mask.
+ *
+ * A non-zero `error` means the ioctl is expected to fail with that errno, in
+ * which case UINT32_MAX is returned.  Otherwise the ioctl must succeed and
+ * the counter number it reports is returned.
+ */
+static uint32_t
+uncore_add(int fd, uint64_t event, uint64_t allowed_ctrs, int error)
+{
+	int save_errno;
+	int r;
+	uint32_t ctr;
+	union monotonic_ctl_add add_ctl;
+
+	add_ctl.in.config.event = event;
+	add_ctl.in.config.allowed_ctr_mask = allowed_ctrs;
+	r = ioctl(fd, MT_IOC_ADD, &add_ctl);
+	if (error) {
+		save_errno = errno;
+		T_EXPECT_LT(r, 0, "adding event to counter should fail");
+		T_EXPECT_EQ(save_errno, error,
+		    "adding event to counter should fail with %d: %s",
+		    error, strerror(error));
+		return UINT32_MAX;
+	} else {
+		T_QUIET;
+		T_ASSERT_POSIX_SUCCESS(r,
+		    "added event %#" PRIx64 " to counters", event);
+	}
+
+	ctr = add_ctl.out.ctr;
+	T_QUIET; T_ASSERT_LT(ctr, (uint32_t)CTRS_MAX, "counter returned should be sane");
+	return ctr;
+}
+
+T_DECL(uncore_collision,
+    "ensure that trying to add an event on the same counter fails",
+    T_META_ASROOT(true))
+{
+	int fd;
+	uint32_t ctr;
+
+	fd = open_uncore_error(NULL);
+
+	ctr = uncore_add(fd, REF_TIMEBASE_EVENT, UINT64_MAX, 0);
+	T_LOG("added event to uncore counter %u\n", ctr);
+
+	(void)uncore_add(fd, REF_TIMEBASE_EVENT, UINT64_C(1) << ctr, ENOSPC);
+}
+
+/*
+ * Issue MT_IOC_ENABLE with enable set to true; failure fails the test.
+ */
+static void
+uncore_enable(int fd)
+{
+	union monotonic_ctl_enable en_ctl = {
+		.in = { .enable = true }
+	};
+
+	T_ASSERT_POSIX_SUCCESS(ioctl(fd, MT_IOC_ENABLE, &en_ctl),
+	    "enabling counters");
+}
+
+T_DECL(uncore_enabled_busy,
+    "ensure that trying to add an event while enabled fails",
+    T_META_ASROOT(true))
+{
+	int fd;
+
+	fd = open_uncore_error(NULL);
+
+	(void)uncore_add(fd, REF_TIMEBASE_EVENT, UINT64_MAX, 0);
+
+	uncore_enable(fd);
+	(void)uncore_add(fd, REF_TIMEBASE_EVENT, UINT64_MAX, EBUSY);
+}
+
+T_DECL(uncore_reset,
+    "ensure that resetting the counters works")
+{
+	int fd;
+	int r;
+
+	fd = open_uncore_error(NULL);
+
+	(void)uncore_add(fd, REF_TIMEBASE_EVENT, UINT64_C(1), 0);
+	(void)uncore_add(fd, REF_TIMEBASE_EVENT, UINT64_C(1), ENOSPC);
+
+	r = ioctl(fd, MT_IOC_RESET);
+	T_ASSERT_POSIX_SUCCESS(r, "resetting succeeds");
+
+	T_LOG("adding event to same counter after reset");
+	(void)uncore_add(fd, REF_TIMEBASE_EVENT, UINT64_C(1), 0);
+}
+
+#define SLEEP_USECS (500 * 1000)
+
+/*
+ * Add `event` with MT_IOC_ADD repeatedly until the ioctl fails with E2BIG or
+ * CTRS_MAX additions have succeeded, and return how many were added.  A
+ * non-NULL `nmonitors` receives the monitor count from MT_IOC_GET_INFO.
+ */
+static int
+uncore_add_all(int fd, uint64_t event, int *nmonitors)
+{
+	int nctrs = 0;
+	int r;
+
+	do {
+		union monotonic_ctl_add add_ctl;
+
+		add_ctl.in.config.event = event;
+		add_ctl.in.config.allowed_ctr_mask = UINT64_MAX;
+
+		r = ioctl(fd, MT_IOC_ADD, &add_ctl);
+		if (r < 0 && errno == E2BIG) {
+			break;
+		}
+
+		T_QUIET;
+		T_ASSERT_POSIX_SUCCESS(r, "added event %#" PRIx64 " to counters",
+		    event);
+		nctrs++;
+	} while (nctrs < CTRS_MAX);
+
+	if (nmonitors) {
+		union monotonic_ctl_info info_ctl;
+		r = ioctl(fd, MT_IOC_GET_INFO, &info_ctl);
+		T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "got info about uncore counters");
+
+		*nmonitors = (int)info_ctl.out.nmonitors;
+	}
+
+	return nctrs;
+}
+
+T_DECL(uncore_accuracy,
+    "ensure that the uncore counters count accurately",
+    T_META_ASROOT(true))
+{
+	int fd;
+	int nctrs = 0;
+	int nmonitors = 0;
+	uint64_t ctr_mask;
+	uint64_t counts[2][CTRS_MAX];
+	uint64_t times[2];
+
+	fd = open_uncore_error(NULL);
+
+	/*
+	 * The reference timebase event counts the same as mach_continuous_time
+	 * (on hardware supporting uncore counters).  Make sure that the counter
+	 * is close to the values returned from the trap.
+	 *
+	 * Fill all the counters with this event.
+	 */
+	nctrs = uncore_add_all(fd, REF_TIMEBASE_EVENT, &nmonitors);
+	ctr_mask = (UINT64_C(1) << nctrs) - 1;
+
+	T_LOG("added %d counters to check", nctrs);
+
+	uncore_enable(fd);
+
+	/*
+	 * First, make sure there's an upper bound on the counter -- take the
+	 * time around getting the counter values.
+	 */
+
+	times[0] = mach_absolute_time();
+	uncore_counts(fd, ctr_mask, counts[0]);
+
+	usleep(SLEEP_USECS);
+
+	uncore_counts(fd, ctr_mask, counts[1]);
+	times[1] = mach_absolute_time();
+
+	T_QUIET; T_EXPECT_GT(times[1], times[0],
+	    "mach_absolute_time is monotonically increasing");
+	for (int i = 0; i < nctrs; i++) {
+		T_EXPECT_GT(counts[1][i], counts[0][i],
+		    "uncore counter %d value is monotonically increasing", i);
+		T_EXPECT_LT(counts[1][i] - counts[0][i], times[1] - times[0],
+		    "reference timebase on uncore counter %d satisfies upper bound "
+		    "from mach_absolute_time", i);
+	}
+
+	/*
+	 * Next, the lower bound -- put mach_absolute_time inside getting the
+	 * counter values.
+	 */
+
+	uncore_counts(fd, ctr_mask, counts[0]);
+	times[0] = mach_absolute_time();
+
+	volatile int iterations = 100000;
+	while (iterations--) {
+		;
+	}
+
+	times[1] = mach_absolute_time();
+	uncore_counts(fd, ctr_mask, counts[1]);
+
+	for (int mon = 0; mon < nmonitors; mon++) {
+		for (int i = 0; i < nctrs; i++) {
+			/*
+			 * NOTE(review): assumes each monitor's nctrs values are stored
+			 * back to back (the first loop reads entries [0, nctrs)) and
+			 * that nctrs * nmonitors fits in CTRS_MAX -- confirm both.
+			 */
+			int idx = mon * nctrs + i;
+
+			T_QUIET;
+			T_EXPECT_GT(counts[1][idx], counts[0][idx],
+			    "uncore %d counter %d value is monotonically increasing",
+			    mon, i);
+			T_EXPECT_GT(counts[1][idx] - counts[0][idx],
+			    times[1] - times[0],
+			    "reference timebase on uncore %d counter %d satisfies "
+			    "lower bound from mach_absolute_time", mon, i);
+		}
+	}
+}
+
+T_DECL(uncore_ownership,
+    "ensure the dev node cannot be open in two places",
+    T_META_ASROOT(true))
+{
+	int fd;
+	int other_fd;
+	int error;
+
+	fd = open_uncore_error(NULL);
+
+	other_fd = open_uncore_error(&error);
+	T_ASSERT_LT(other_fd, 0, "opening a second uncore fd should fail");
+	T_ASSERT_EQ(error, EBUSY, "failure should be EBUSY");
+}
+
+T_DECL(uncore_root_required,
+    "ensure the dev node cannot be opened by non-root users",
+    T_META_ASROOT(false))
+{
+	int fd;
+	int error = 0;
+
+	T_SKIP("libdarwintest doesn't drop privileges properly");
+
+	fd = open_uncore_error(&error);
+	T_ASSERT_LT(fd, 0, "opening dev node should not return an fd");
+	T_ASSERT_EQ(error, EPERM,
+	    "opening dev node as non-root user should fail with EPERM");
+}
+
+T_DECL(perf_uncore,
+    "measure the latency of accessing the counters",
+    T_META_TAG_PERF)
+{
+	int fd;
+	int nctrs;
+	int nmonitors;
+	int r;
+	uint64_t ctr_mask;
+	dt_stat_thread_instructions_t counts_instrs;
+	dt_stat_t counter_deltas;
+
+	counts_instrs = dt_stat_thread_instructions_create("ioctl_counts");
+	counter_deltas = dt_stat_create("abs_time", "between_each_counter");
+
+	fd = open_uncore_error(NULL);
+
+	nctrs = uncore_add_all(fd, REF_TIMEBASE_EVENT, &nmonitors);
+	ctr_mask = (UINT64_C(1) << nctrs) - 1;
+
+	uncore_enable(fd);
+
+	do {
+		dt_stat_token token;
+		uint64_t counts[nctrs * nmonitors];
+		union monotonic_ctl_counts *cts_ctl;
+
+		cts_ctl = (union monotonic_ctl_counts *)counts;
+		cts_ctl->in.ctr_mask = ctr_mask;
+
+		token = dt_stat_thread_instructions_begin(counts_instrs);
+		r = ioctl(fd, MT_IOC_COUNTS, cts_ctl);
+		dt_stat_thread_instructions_end(counts_instrs, token);
+		T_QUIET;
+		T_ASSERT_POSIX_SUCCESS(r,
+		    "getting uncore counter values %#" PRIx64, ctr_mask);
+
+		for (int i = 0; i < (nctrs - 1); i++) {
+			dt_stat_add(counter_deltas, (double)(counts[i + 1] - counts[i]));
+		}
+	} while (!dt_stat_stable(counts_instrs) || !dt_stat_stable(counter_deltas));
+
+	dt_stat_finalize(counts_instrs);
+	dt_stat_finalize(counter_deltas);
+}