diff --git a/Makefile b/Makefile index 7fbedaceb..2a4b9ab19 100644 --- a/Makefile +++ b/Makefile @@ -100,12 +100,25 @@ installsrc: pax -rw . $(SRCROOT) else ifeq ($(RC_ProjectName),xnu_quick_test) +# This rule should be removed once rdar://22820602 is complete. +default: install + +installhdrs: + +install: xnu_tests + +clean: + +installsrc: + pax -rw . $(SRCROOT) + +else ifeq ($(RC_ProjectName),xnu_tests) default: install installhdrs: -install: xnu_quick_test +install: xnu_tests clean: @@ -237,11 +250,11 @@ installhdrs_libkdd install_libkdd: "SDKROOT=$(SDKROOT)" -# "xnu_quick_test" and "testbots" are targets that can be invoked via a standalone -# "make xnu_quick_test" or via buildit/XBS with the RC_ProjectName=xnu_quick_test. +# "xnu_tests" and "testbots" are targets that can be invoked via a standalone +# "make xnu_tests" or via buildit/XBS with the RC_ProjectName=xnu_tests. # Define the target here in the outermost scope of the initial Makefile -xnu_quick_test: +xnu_tests xnu_quick_test: $(MAKE) -C $(SRCROOT)/tools/tests \ SRCROOT=$(SRCROOT)/tools/tests diff --git a/bsd/dev/i386/sysctl.c b/bsd/dev/i386/sysctl.c index 06e938975..2e4672f23 100644 --- a/bsd/dev/i386/sysctl.c +++ b/bsd/dev/i386/sysctl.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -758,7 +759,11 @@ SYSCTL_QUAD(_machdep_tsc, OID_AUTO, frequency, extern uint32_t deep_idle_rebase; SYSCTL_UINT(_machdep_tsc, OID_AUTO, deep_idle_rebase, - CTLFLAG_RW|CTLFLAG_KERN|CTLFLAG_LOCKED, &deep_idle_rebase, 0, ""); + CTLFLAG_RD|CTLFLAG_LOCKED, &deep_idle_rebase, 0, ""); +SYSCTL_QUAD(_machdep_tsc, OID_AUTO, at_boot, + CTLFLAG_RD|CTLFLAG_LOCKED, &tsc_at_boot, ""); +SYSCTL_QUAD(_machdep_tsc, OID_AUTO, rebase_abs_time, + CTLFLAG_RD|CTLFLAG_LOCKED, &tsc_rebase_abs_time, ""); SYSCTL_NODE(_machdep_tsc, OID_AUTO, nanotime, CTLFLAG_RD|CTLFLAG_LOCKED, NULL, "TSC to ns conversion"); diff --git a/bsd/hfs/hfs_hotfiles.c b/bsd/hfs/hfs_hotfiles.c index 143bc1983..b6fa4a276 100644 --- 
a/bsd/hfs/hfs_hotfiles.c +++ b/bsd/hfs/hfs_hotfiles.c @@ -1709,8 +1709,10 @@ hfs_recording_init(struct hfsmount *hfsmp) } cnid = filep->fileID; - /* Skip over journal files. */ - if (cnid == hfsmp->hfs_jnlfileid || cnid == hfsmp->hfs_jnlinfoblkid) { + /* Skip over journal files and the hotfiles B-Tree file. */ + if (cnid == hfsmp->hfs_jnlfileid + || cnid == hfsmp->hfs_jnlinfoblkid + || cnid == VTOC(hfsmp->hfc_filevp)->c_fileid) { continue; } /* @@ -2865,6 +2867,15 @@ hotfiles_evict(struct hfsmount *hfsmp, vfs_context_t ctx) stage = HFC_ADOPTION; break; } + + // Jump straight to delete for some files... + if (key->fileID == VTOC(hfsmp->hfc_filevp)->c_fileid + || key->fileID == hfsmp->hfs_jnlfileid + || key->fileID == hfsmp->hfs_jnlinfoblkid + || key->fileID < kHFSFirstUserCatalogNodeID) { + goto delete; + } + /* * Aquire the vnode for this file. */ diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c index a819362bb..71380f628 100644 --- a/bsd/hfs/hfs_vfsops.c +++ b/bsd/hfs/hfs_vfsops.c @@ -2847,6 +2847,26 @@ hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp) // XXXdbg #include +static hfsmount_t *hfs_mount_from_cwd(vfs_context_t ctx) +{ + vnode_t vp = vfs_context_cwd(ctx); + + if (!vp) + return NULL; + + /* + * We could use vnode_tag, but it is probably more future proof to + * compare fstypename. + */ + char fstypename[MFSNAMELEN]; + vnode_vfsname(vp, fstypename); + + if (strcmp(fstypename, "hfs")) + return NULL; + + return VTOHFS(vp); +} + /* * HFS filesystem related variables. */ @@ -2930,7 +2950,6 @@ hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, } else if (name[0] == HFS_ENABLE_JOURNALING) { // make the file system journaled... 
- vnode_t vp = vfs_context_cwd(context); vnode_t jvp; ExtendedVCB *vcb; struct cat_attr jnl_attr; @@ -2952,10 +2971,11 @@ hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, if (!kauth_cred_issuser(kauth_cred_get())) { return (EPERM); } - if (vp == NULLVP) - return EINVAL; - hfsmp = VTOHFS(vp); + hfsmp = hfs_mount_from_cwd(context); + if (!hfsmp) + return EINVAL; + if (hfsmp->hfs_flags & HFS_READ_ONLY) { return EROFS; } @@ -2965,7 +2985,7 @@ hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, } if (hfsmp->jnl) { - printf("hfs: volume @ mp %p is already journaled!\n", vnode_mount(vp)); + printf("hfs: volume %s is already journaled!\n", hfsmp->vcbVN); return EAGAIN; } vcb = HFSTOVCB(hfsmp); @@ -3145,16 +3165,15 @@ hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, return 0; } else if (name[0] == HFS_DISABLE_JOURNALING) { // clear the journaling bit - vnode_t vp = vfs_context_cwd(context); - + /* Only root can disable journaling */ if (!kauth_cred_issuser(kauth_cred_get())) { return (EPERM); } - if (vp == NULLVP) - return EINVAL; - hfsmp = VTOHFS(vp); + hfsmp = hfs_mount_from_cwd(context); + if (!hfsmp) + return EINVAL; /* * Disabling journaling is disallowed on volumes with directory hard links @@ -3165,7 +3184,7 @@ hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, return EPERM; } - printf("hfs: disabling journaling for mount @ %p\n", vnode_mount(vp)); + printf("hfs: disabling journaling for %s\n", hfsmp->vcbVN); hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); @@ -3197,34 +3216,6 @@ hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, fsid.val[1] = (int32_t)vfs_typenum(HFSTOVFS(hfsmp)); vfs_event_signal(&fsid, VQ_UPDATE, (intptr_t)NULL); } - return 0; - } else if (name[0] == HFS_GET_JOURNAL_INFO) { - vnode_t vp = vfs_context_cwd(context); - off_t jnl_start, jnl_size; - - if (vp == NULLVP) - return EINVAL; - - /* 64-bit 
processes won't work with this sysctl -- can't fit a pointer into an int! */ - if (proc_is64bit(current_proc())) - return EINVAL; - - hfsmp = VTOHFS(vp); - if (hfsmp->jnl == NULL) { - jnl_start = 0; - jnl_size = 0; - } else { - jnl_start = hfs_blk_to_bytes(hfsmp->jnl_start, HFSTOVCB(hfsmp)->blockSize) + HFSTOVCB(hfsmp)->hfsPlusIOPosOffset; - jnl_size = hfsmp->jnl_size; - } - - if ((error = copyout((caddr_t)&jnl_start, CAST_USER_ADDR_T(name[1]), sizeof(off_t))) != 0) { - return error; - } - if ((error = copyout((caddr_t)&jnl_size, CAST_USER_ADDR_T(name[2]), sizeof(off_t))) != 0) { - return error; - } - return 0; } else if (name[0] == HFS_SET_PKG_EXTENSIONS) { diff --git a/bsd/hfs/hfs_vnops.c b/bsd/hfs/hfs_vnops.c index dac4b088f..a198b651e 100644 --- a/bsd/hfs/hfs_vnops.c +++ b/bsd/hfs/hfs_vnops.c @@ -2823,8 +2823,9 @@ int hfs_move_data(cnode_t *from_cp, cnode_t *to_cp, // Update to_cp's resource data if it has it filefork_t *to_rfork = to_cp->c_rsrcfork; if (to_rfork) { - to_rfork->ff_invalidranges = from_rfork->ff_invalidranges; - to_rfork->ff_data = from_rfork->ff_data; + TAILQ_SWAP(&to_rfork->ff_invalidranges, + &from_rfork->ff_invalidranges, rl_entry, rl_link); + to_rfork->ff_data = from_rfork->ff_data; // Deal with ubc_setsize hfs_rsrc_setsize(to_cp); diff --git a/bsd/kern/bsd_init.c b/bsd/kern/bsd_init.c index d9b90aff2..4f871df32 100644 --- a/bsd/kern/bsd_init.c +++ b/bsd/kern/bsd_init.c @@ -367,6 +367,7 @@ extern int (*mountroot)(void); lck_grp_t * proc_lck_grp; lck_grp_t * proc_slock_grp; lck_grp_t * proc_fdmlock_grp; +lck_grp_t * proc_ucred_mlock_grp; lck_grp_t * proc_mlock_grp; lck_grp_attr_t * proc_lck_grp_attr; lck_attr_t * proc_lck_attr; @@ -452,6 +453,7 @@ bsd_init(void) #if CONFIG_FINE_LOCK_GROUPS proc_slock_grp = lck_grp_alloc_init("proc-slock", proc_lck_grp_attr); proc_fdmlock_grp = lck_grp_alloc_init("proc-fdmlock", proc_lck_grp_attr); + proc_ucred_mlock_grp = lck_grp_alloc_init("proc-ucred-mlock", proc_lck_grp_attr); proc_mlock_grp = 
lck_grp_alloc_init("proc-mlock", proc_lck_grp_attr); #endif /* Allocate proc lock attribute */ @@ -467,12 +469,14 @@ bsd_init(void) proc_klist_mlock = lck_mtx_alloc_init(proc_mlock_grp, proc_lck_attr); lck_mtx_init(&kernproc->p_mlock, proc_mlock_grp, proc_lck_attr); lck_mtx_init(&kernproc->p_fdmlock, proc_fdmlock_grp, proc_lck_attr); + lck_mtx_init(&kernproc->p_ucred_mlock, proc_ucred_mlock_grp, proc_lck_attr); lck_spin_init(&kernproc->p_slock, proc_slock_grp, proc_lck_attr); #else proc_list_mlock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr); proc_klist_mlock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr); lck_mtx_init(&kernproc->p_mlock, proc_lck_grp, proc_lck_attr); lck_mtx_init(&kernproc->p_fdmlock, proc_lck_grp, proc_lck_attr); + lck_mtx_init(&kernproc->p_ucred_mlock, proc_lck_grp, proc_lck_attr); lck_spin_init(&kernproc->p_slock, proc_lck_grp, proc_lck_attr); #endif diff --git a/bsd/kern/kdebug.c b/bsd/kern/kdebug.c index 27ad69aa1..505fbf81d 100644 --- a/bsd/kern/kdebug.c +++ b/bsd/kern/kdebug.c @@ -134,7 +134,6 @@ int kdbg_readcurcpumap(user_addr_t, size_t *); int kdbg_readthrmap(user_addr_t, size_t *, vnode_t, vfs_context_t); int kdbg_readthrmap_v3(user_addr_t, size_t *, int); int kdbg_readcurthrmap(user_addr_t, size_t *); -int kdbg_getreg(kd_regtype *); int kdbg_setreg(kd_regtype *); int kdbg_setrtcdec(kd_regtype *); int kdbg_setpidex(kd_regtype *); @@ -2290,50 +2289,6 @@ kdbg_setreg(kd_regtype * kdr) return(ret); } -int -kdbg_getreg(__unused kd_regtype * kdr) -{ -#if 0 - int i,j, ret=0; - unsigned int val_1, val_2, val; - - switch (kdr->type) { - case KDBG_CLASSTYPE : - val_1 = (kdr->value1 & 0xff); - val_2 = val_1 + 1; - kdlog_beg = (val_1<<24); - kdlog_end = (val_2<<24); - kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_CLASSTYPE); - break; - case KDBG_SUBCLSTYPE : - val_1 = (kdr->value1 & 0xff); - val_2 = (kdr->value2 & 0xff); - val = val_2 + 1; - kdlog_beg = ((val_1<<24) | (val_2 
<< 16)); - kdlog_end = ((val_1<<24) | (val << 16)); - kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_SUBCLSTYPE); - break; - case KDBG_RANGETYPE : - kdlog_beg = (kdr->value1); - kdlog_end = (kdr->value2); - kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_RANGETYPE); - break; - case KDBG_TYPENONE : - kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kdlog_beg = 0; - kdlog_end = 0; - break; - default : - ret = EINVAL; - break; - } -#endif /* 0 */ - return(EINVAL); -} - static int kdbg_write_to_vnode(caddr_t buffer, size_t size, vnode_t vp, vfs_context_t ctx, off_t file_offset) { @@ -3206,16 +3161,8 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) ret = kdbg_setreg(&kd_Reg); break; case KERN_KDGETREG: - if (size < sizeof(kd_regtype)) { - ret = EINVAL; - break; - } - ret = kdbg_getreg(&kd_Reg); - if (copyout(&kd_Reg, where, sizeof(kd_regtype))) { - ret = EINVAL; - } kdbg_disable_bg_trace(); - + ret = EINVAL; break; case KERN_KDREADTR: ret = kdbg_read(where, sizep, NULL, NULL, RAW_VERSION1); diff --git a/bsd/kern/kern_credential.c b/bsd/kern/kern_credential.c index fbbfb752a..0d2a07e02 100644 --- a/bsd/kern/kern_credential.c +++ b/bsd/kern/kern_credential.c @@ -3598,7 +3598,7 @@ kauth_cred_get_with_ref(void) * Returns: (kauth_cred_t) Pointer to the process's * newly referenced credential * - * Locks: PROC_LOCK is held before taking the reference and released + * Locks: PROC_UCRED_LOCK is held before taking the reference and released * after the refeence is taken to protect the p_ucred field of * the process referred to by procp. 
* @@ -3620,10 +3620,10 @@ kauth_cred_proc_ref(proc_t procp) { kauth_cred_t cred; - proc_lock(procp); + proc_ucred_lock(procp); cred = proc_ucred(procp); kauth_cred_ref(cred); - proc_unlock(procp); + proc_ucred_unlock(procp); return(cred); } @@ -4456,7 +4456,7 @@ int kauth_proc_label_update(struct proc *p, struct label *label) DEBUG_CRED_CHANGE("kauth_proc_setlabel_unlocked CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_cred->cr_flags, my_new_cred, my_new_cred->cr_flags); - proc_lock(p); + proc_ucred_lock(p); /* * We need to protect for a race where another thread * also changed the credential after we took our @@ -4464,7 +4464,7 @@ int kauth_proc_label_update(struct proc *p, struct label *label) * restart this again with the new cred. */ if (p->p_ucred != my_cred) { - proc_unlock(p); + proc_ucred_unlock(p); kauth_cred_unref(&my_new_cred); my_cred = kauth_cred_proc_ref(p); /* try again */ @@ -4475,7 +4475,7 @@ int kauth_proc_label_update(struct proc *p, struct label *label) PROC_UPDATE_CREDS_ONPROC(p); mac_proc_set_enforce(p, MAC_ALL_ENFORCE); - proc_unlock(p); + proc_ucred_unlock(p); } break; } @@ -4536,7 +4536,7 @@ kauth_proc_label_update_execve(struct proc *p, vfs_context_t ctx, DEBUG_CRED_CHANGE("kauth_proc_label_update_execve_unlocked CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_cred->cr_flags, my_new_cred, my_new_cred->cr_flags); - proc_lock(p); + proc_ucred_lock(p); /* * We need to protect for a race where another thread * also changed the credential after we took our @@ -4544,7 +4544,7 @@ kauth_proc_label_update_execve(struct proc *p, vfs_context_t ctx, * restart this again with the new cred. 
*/ if (p->p_ucred != my_cred) { - proc_unlock(p); + proc_ucred_unlock(p); kauth_cred_unref(&my_new_cred); my_cred = kauth_cred_proc_ref(p); /* try again */ @@ -4554,7 +4554,7 @@ kauth_proc_label_update_execve(struct proc *p, vfs_context_t ctx, /* update cred on proc */ PROC_UPDATE_CREDS_ONPROC(p); mac_proc_set_enforce(p, MAC_ALL_ENFORCE); - proc_unlock(p); + proc_ucred_unlock(p); } break; } diff --git a/bsd/kern/kern_cs.c b/bsd/kern/kern_cs.c index 1459a472f..fb3a2012c 100644 --- a/bsd/kern/kern_cs.c +++ b/bsd/kern/kern_cs.c @@ -71,6 +71,8 @@ #include +#include + unsigned long cs_procs_killed = 0; unsigned long cs_procs_invalidated = 0; @@ -84,16 +86,18 @@ const int cs_library_val_enable = 1; int cs_enforcement_panic=0; #if CONFIG_ENFORCE_SIGNED_CODE -int cs_enforcement_enable = 1; +#define DEFAULT_CS_ENFORCEMENT_ENABLE 1 #else -int cs_enforcement_enable = 0; +#define DEFAULT_CS_ENFORCEMENT_ENABLE 0 #endif +SECURITY_READ_ONLY_LATE(int) cs_enforcement_enable = DEFAULT_CS_ENFORCEMENT_ENABLE; #if CONFIG_ENFORCE_LIBRARY_VALIDATION -int cs_library_val_enable = 1; +#define DEFAULT_CS_LIBRARY_VA_ENABLE 1 #else -int cs_library_val_enable = 0; +#define DEFAULT_CS_LIBRARY_VA_ENABLE 0 #endif +SECURITY_READ_ONLY_LATE(int) cs_library_val_enable = DEFAULT_CS_LIBRARY_VA_ENABLE; #endif /* !SECURE_KERNEL */ int cs_all_vnodes = 0; diff --git a/bsd/kern/kern_fork.c b/bsd/kern/kern_fork.c index ff5d6dda6..7b3e2440e 100644 --- a/bsd/kern/kern_fork.c +++ b/bsd/kern/kern_fork.c @@ -1255,6 +1255,7 @@ forkproc(proc_t parent_proc) #if CONFIG_FINE_LOCK_GROUPS lck_mtx_init(&child_proc->p_mlock, proc_mlock_grp, proc_lck_attr); lck_mtx_init(&child_proc->p_fdmlock, proc_fdmlock_grp, proc_lck_attr); + lck_mtx_init(&child_proc->p_ucred_mlock, proc_ucred_mlock_grp, proc_lck_attr); #if CONFIG_DTRACE lck_mtx_init(&child_proc->p_dtrace_sprlock, proc_lck_grp, proc_lck_attr); #endif @@ -1262,6 +1263,7 @@ forkproc(proc_t parent_proc) #else /* !CONFIG_FINE_LOCK_GROUPS */ lck_mtx_init(&child_proc->p_mlock, 
proc_lck_grp, proc_lck_attr); lck_mtx_init(&child_proc->p_fdmlock, proc_lck_grp, proc_lck_attr); + lck_mtx_init(&child_proc->p_ucred_mlock, proc_lck_grp, proc_lck_attr); #if CONFIG_DTRACE lck_mtx_init(&child_proc->p_dtrace_sprlock, proc_lck_grp, proc_lck_attr); #endif @@ -1391,6 +1393,7 @@ forkproc(proc_t parent_proc) void proc_lock(proc_t p) { + lck_mtx_assert(proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED); lck_mtx_lock(&p->p_mlock); } @@ -1424,6 +1427,18 @@ proc_list_unlock(void) lck_mtx_unlock(proc_list_mlock); } +void +proc_ucred_lock(proc_t p) +{ + lck_mtx_lock(&p->p_ucred_mlock); +} + +void +proc_ucred_unlock(proc_t p) +{ + lck_mtx_unlock(&p->p_ucred_mlock); +} + #include struct zone *uthread_zone; @@ -1555,6 +1570,12 @@ uthread_cleanup(task_t task, void *uthread, void * bsd_info, boolean_t is_corpse uthread_t uth = (uthread_t)uthread; proc_t p = (proc_t)bsd_info; +#if PROC_REF_DEBUG + if (__improbable(uthread_get_proc_refcount(uthread) != 0)) { + panic("uthread_cleanup called for uthread %p with uu_proc_refcount != 0", uthread); + } +#endif + if (uth->uu_lowpri_window || uth->uu_throttle_info) { /* * task is marked as a low priority I/O type diff --git a/bsd/kern/kern_memorystatus.c b/bsd/kern/kern_memorystatus.c index 13dcc2607..97296a8a8 100644 --- a/bsd/kern/kern_memorystatus.c +++ b/bsd/kern/kern_memorystatus.c @@ -3093,24 +3093,34 @@ memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause) { proc_t p; /* TODO - add a victim queue and push this into the main jetsam thread */ - p = proc_find(victim_pid); if (!p) { return FALSE; } - printf("memorystatus: specifically killing pid %d [%s] (%s %d) - memorystatus_available_pages: %d\n", - victim_pid, (p->p_comm ? 
p->p_comm : "(unknown)"), - jetsam_kill_cause_name[cause], p->p_memstat_effectivepriority, memorystatus_available_pages); - proc_list_lock(); + if ((p->p_memstat_state & P_MEMSTAT_TERMINATED) || + (p->p_listflag & P_LIST_EXITED) || + (p->p_memstat_state & P_MEMSTAT_ERROR)) { + proc_list_unlock(); + proc_rele(p); + return FALSE; + } + + p->p_memstat_state |= P_MEMSTAT_TERMINATED; + if (memorystatus_jetsam_snapshot_count == 0) { memorystatus_init_jetsam_snapshot_locked(NULL,0); } memorystatus_update_jetsam_snapshot_entry_locked(p, cause); proc_list_unlock(); + + printf("memorystatus: specifically killing pid %d [%s] (%s %d) - memorystatus_available_pages: %d\n", + victim_pid, (p->p_comm ? p->p_comm : "(unknown)"), + jetsam_kill_cause_name[cause], p->p_memstat_effectivepriority, memorystatus_available_pages); + killed = memorystatus_do_kill(p, cause); proc_rele(p); diff --git a/bsd/kern/kern_prot.c b/bsd/kern/kern_prot.c index 5df82a23f..e90c68c55 100644 --- a/bsd/kern/kern_prot.c +++ b/bsd/kern/kern_prot.c @@ -782,7 +782,7 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval) (void)chgproccnt(ruid, 1); } - proc_lock(p); + proc_ucred_lock(p); /* * We need to protect for a race where another thread * also changed the credential after we took our @@ -792,7 +792,7 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval) * Note: the kauth_cred_setresuid has consumed a reference to my_cred, it p_ucred != my_cred, then my_cred must not be dereferenced! */ if (p->p_ucred != my_cred) { - proc_unlock(p); + proc_ucred_unlock(p); /* * We didn't successfully switch to the new ruid, so decrement * the procs/uid count that we incremented above. 
@@ -811,7 +811,7 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval) PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); - proc_unlock(p); + proc_ucred_unlock(p); /* * If we've updated the ruid, decrement the count of procs running * under the previous ruid @@ -885,7 +885,7 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval) DEBUG_CRED_CHANGE("seteuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags); - proc_lock(p); + proc_ucred_lock(p); /* * We need to protect for a race where another thread * also changed the credential after we took our @@ -893,7 +893,7 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval) * should restart this again with the new cred. */ if (p->p_ucred != my_cred) { - proc_unlock(p); + proc_ucred_unlock(p); kauth_cred_unref(&my_new_cred); my_cred = kauth_cred_proc_ref(p); my_pcred = posix_cred_get(my_cred); @@ -904,7 +904,7 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval) /* update cred on proc */ PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); - proc_unlock(p); + proc_ucred_unlock(p); } break; } @@ -1030,7 +1030,7 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) (void)chgproccnt(ruid, 1); } - proc_lock(p); + proc_ucred_lock(p); /* * We need to protect for a race where another thread * also changed the credential after we took our @@ -1040,7 +1040,7 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) * Note: the kauth_cred_setresuid has consumed a reference to my_cred, it p_ucred != my_cred, then my_cred must not be dereferenced! 
*/ if (p->p_ucred != my_cred) { - proc_unlock(p); + proc_ucred_unlock(p); if (ruid != KAUTH_UID_NONE && chgproccnt_ok(p)) { /* * We didn't successfully switch to the new ruid, so decrement @@ -1059,7 +1059,7 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) /* update cred on proc */ PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); - proc_unlock(p); + proc_ucred_unlock(p); if (ruid != KAUTH_UID_NONE && chgproccnt_ok(p)) { /* @@ -1155,7 +1155,7 @@ setgid(proc_t p, struct setgid_args *uap, __unused int32_t *retval) DEBUG_CRED_CHANGE("setgid(CH)%d: %p/0x%08x->%p/0x%08x\n", p->p_pid, my_cred, my_cred->cr_flags, my_new_cred, my_new_cred->cr_flags); - proc_lock(p); + proc_ucred_lock(p); /* * We need to protect for a race where another thread * also changed the credential after we took our @@ -1163,7 +1163,7 @@ setgid(proc_t p, struct setgid_args *uap, __unused int32_t *retval) * should restart this again with the new cred. */ if (p->p_ucred != my_cred) { - proc_unlock(p); + proc_ucred_unlock(p); kauth_cred_unref(&my_new_cred); /* try again */ my_cred = kauth_cred_proc_ref(p); @@ -1174,7 +1174,7 @@ setgid(proc_t p, struct setgid_args *uap, __unused int32_t *retval) /* update cred on proc */ PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); - proc_unlock(p); + proc_ucred_unlock(p); } break; } @@ -1246,7 +1246,7 @@ setegid(proc_t p, struct setegid_args *uap, __unused int32_t *retval) DEBUG_CRED_CHANGE("setegid(CH)%d: %p/0x%08x->%p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags); - proc_lock(p); + proc_ucred_lock(p); /* * We need to protect for a race where another thread * also changed the credential after we took our @@ -1254,7 +1254,7 @@ setegid(proc_t p, struct setegid_args *uap, __unused int32_t *retval) * should restart this again with the new cred. 
*/ if (p->p_ucred != my_cred) { - proc_unlock(p); + proc_ucred_unlock(p); kauth_cred_unref(&my_new_cred); /* try again */ my_cred = kauth_cred_proc_ref(p); @@ -1265,7 +1265,7 @@ setegid(proc_t p, struct setegid_args *uap, __unused int32_t *retval) /* update cred on proc */ PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); - proc_unlock(p); + proc_ucred_unlock(p); } break; } @@ -1393,14 +1393,14 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) DEBUG_CRED_CHANGE("setregid(CH)%d: %p/0x%08x->%p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags); - proc_lock(p); + proc_ucred_lock(p); /* need to protect for a race where another thread * also changed the credential after we took our * reference. If p_ucred has changed then we * should restart this again with the new cred. */ if (p->p_ucred != my_cred) { - proc_unlock(p); + proc_ucred_unlock(p); kauth_cred_unref(&my_new_cred); /* try again */ my_cred = kauth_cred_proc_ref(p); @@ -1411,7 +1411,7 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) /* update cred on proc */ PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); /* XXX redundant? */ - proc_unlock(p); + proc_ucred_unlock(p); } break; } @@ -1698,7 +1698,7 @@ setgroups1(proc_t p, u_int gidsetsize, user_addr_t gidset, uid_t gmuid, __unused DEBUG_CRED_CHANGE("setgroups1(CH)%d: %p/0x%08x->%p/0x%08x\n", p->p_pid, my_cred, my_cred->cr_flags, my_new_cred, my_new_cred->cr_flags); - proc_lock(p); + proc_ucred_lock(p); /* * We need to protect for a race where another * thread also changed the credential after we @@ -1707,7 +1707,7 @@ setgroups1(proc_t p, u_int gidsetsize, user_addr_t gidset, uid_t gmuid, __unused * with the new cred. 
*/ if (p->p_ucred != my_cred) { - proc_unlock(p); + proc_ucred_unlock(p); kauth_cred_unref(&my_new_cred); my_cred = kauth_cred_proc_ref(p); /* try again */ @@ -1717,7 +1717,7 @@ setgroups1(proc_t p, u_int gidsetsize, user_addr_t gidset, uid_t gmuid, __unused /* update cred on proc */ PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); - proc_unlock(p); + proc_ucred_unlock(p); } break; } diff --git a/bsd/kern/kern_sig.c b/bsd/kern/kern_sig.c index 5f3e5960c..bf5507903 100644 --- a/bsd/kern/kern_sig.c +++ b/bsd/kern/kern_sig.c @@ -367,7 +367,7 @@ cansignal(proc_t p, kauth_cred_t uc, proc_t q, int signum, int zombie) */ unsigned sigrestrict_arg = 0; -#if PLATFORM_WatchOS || PLATFORM_AppleTVOS +#if PLATFORM_WatchOS static int sigrestrictmask(void) { @@ -400,7 +400,7 @@ signal_is_restricted(proc_t p, int signum) (void)signum; return 0; } -#endif /* !(PLATFORM_WatchOS || PLATFORM_AppleTVOS) */ +#endif /* !PLATFORM_WatchOS */ /* * Returns: 0 Success @@ -430,12 +430,29 @@ sigaction(proc_t p, struct sigaction_args *uap, __unused int32_t *retval) signum == SIGKILL || signum == SIGSTOP) return (EINVAL); - if ((error = signal_is_restricted(p, signum))) { - if (error == ENOTSUP) { - printf("%s(%d): denied attempt to register action for signal %d\n", - proc_name_address(p), proc_pid(p), signum); + if (uap->nsa) { + if (IS_64BIT_PROCESS(p)) { + struct __user64_sigaction __vec64; + error = copyin(uap->nsa, &__vec64, sizeof(__vec64)); + __sigaction_user64_to_kern(&__vec64, &__vec); + } else { + struct __user32_sigaction __vec32; + error = copyin(uap->nsa, &__vec32, sizeof(__vec32)); + __sigaction_user32_to_kern(&__vec32, &__vec); + } + if (error) + return (error); + __vec.sa_flags &= SA_USERSPACE_MASK; /* Only pass on valid sa_flags */ + + if ((__vec.sa_flags & SA_SIGINFO) || __vec.sa_handler != SIG_DFL) { + if ((error = signal_is_restricted(p, signum))) { + if (error == ENOTSUP) { + printf("%s(%d): denied attempt to register action for signal %d\n", + 
proc_name_address(p), proc_pid(p), signum); + } + return error; + } } - return error; } if (uap->osa) { @@ -460,35 +477,21 @@ sigaction(proc_t p, struct sigaction_args *uap, __unused int32_t *retval) if (IS_64BIT_PROCESS(p)) { struct user64_sigaction vec64; - sigaction_kern_to_user64(sa, &vec64); error = copyout(&vec64, uap->osa, sizeof(vec64)); } else { struct user32_sigaction vec32; - sigaction_kern_to_user32(sa, &vec32); error = copyout(&vec32, uap->osa, sizeof(vec32)); } if (error) return (error); } + if (uap->nsa) { - if (IS_64BIT_PROCESS(p)) { - struct __user64_sigaction __vec64; - - error = copyin(uap->nsa, &__vec64, sizeof(__vec64)); - __sigaction_user64_to_kern(&__vec64, &__vec); - } else { - struct __user32_sigaction __vec32; - - error = copyin(uap->nsa, &__vec32, sizeof(__vec32)); - __sigaction_user32_to_kern(&__vec32, &__vec); - } - if (error) - return (error); - __vec.sa_flags &= SA_USERSPACE_MASK; /* Only pass on valid sa_flags */ error = setsigvec(p, current_thread(), signum, &__vec, FALSE); } + return (error); } @@ -1714,6 +1717,18 @@ threadsignal(thread_t sig_actthread, int signum, mach_exception_code_t code) signal_setast(sig_actthread); } +/* + * get_signalthread + * + * Picks an appropriate thread from a process to target with a signal. + * + * Called with proc locked. + * Returns thread with BSD ast set. + * + * We attempt to deliver a proc-wide signal to the first thread in the task. + * This allows single threaded applications which use signals to + * be able to be linked with multithreaded libraries. 
+ */ static kern_return_t get_signalthread(proc_t p, int signum, thread_t * thr) { @@ -1735,19 +1750,15 @@ get_signalthread(proc_t p, int signum, thread_t * thr) return(KERN_FAILURE); } - proc_lock(p); - TAILQ_FOREACH(uth, &p->p_uthlist, uu_list) { if(((uth->uu_flag & UT_NO_SIGMASK)== 0) && (((uth->uu_sigmask & mask) == 0) || (uth->uu_sigwait & mask))) { if (check_actforsig(p->task, uth->uu_context.vc_thread, 1) == KERN_SUCCESS) { *thr = uth->uu_context.vc_thread; - proc_unlock(p); return(KERN_SUCCESS); } } } - proc_unlock(p); if (get_signalact(p->task, thr, 1) == KERN_SUCCESS) { return(KERN_SUCCESS); } @@ -1775,7 +1786,7 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) user_addr_t action = USER_ADDR_NULL; proc_t sig_proc; thread_t sig_thread; - register task_t sig_task; + task_t sig_task; int mask; struct uthread *uth; kern_return_t kret; @@ -1784,7 +1795,8 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) kauth_cred_t my_cred; if ((u_int)signum >= NSIG || signum == 0) - panic("psignal signal number"); + panic("psignal: bad signal number %d", signum); + mask = sigmask(signum); prop = sigprop[signum]; @@ -1814,12 +1826,13 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) sig_thread = thread; sig_proc = (proc_t)get_bsdtask_info(sig_task); } else if (flavor & PSIG_TRY_THREAD) { + assert((thread == current_thread()) && (p == current_proc())); sig_task = p->task; sig_thread = thread; sig_proc = p; } else { sig_task = p->task; - sig_thread = (struct thread *)0; + sig_thread = THREAD_NULL; sig_proc = p; } @@ -1833,9 +1846,10 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) * also no need to send a signal to a process that is in the middle * of being torn down. 
*/ - if (ISSET(sig_proc->p_flag, P_REBOOT) || - ISSET(sig_proc->p_lflag, P_LEXIT)) + if (ISSET(sig_proc->p_flag, P_REBOOT) || ISSET(sig_proc->p_lflag, P_LEXIT)) { + DTRACE_PROC3(signal__discard, thread_t, sig_thread, proc_t, sig_proc, int, signum); return; + } if( (flavor & (PSIG_VFORK | PSIG_THREAD)) == 0) { proc_knote(sig_proc, NOTE_SIGNAL | signum); @@ -1844,22 +1858,22 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) if ((flavor & PSIG_LOCKED)== 0) proc_signalstart(sig_proc, 0); - /* - * Deliver the signal to the first thread in the task. This - * allows single threaded applications which use signals to - * be able to be linked with multithreaded libraries. We have - * an implicit reference to the current thread, but need - * an explicit one otherwise. The thread reference keeps - * the corresponding task data structures around too. This - * reference is released by thread_deallocate. - */ - - + /* Don't send signals to a process that has ignored them. */ if (((flavor & PSIG_VFORK) == 0) && ((sig_proc->p_lflag & P_LTRACED) == 0) && (sig_proc->p_sigignore & mask)) { DTRACE_PROC3(signal__discard, thread_t, sig_thread, proc_t, sig_proc, int, signum); - goto psigout; + goto sigout_unlocked; } + /* + * The proc_lock prevents the targeted thread from being deallocated + * or handling the signal until we're done signaling it. + * + * Once the proc_lock is dropped, we have no guarantee the thread or uthread exists anymore. + * + * XXX: What if the thread goes inactive after the thread passes bsd ast point? 
+ */ + proc_lock(sig_proc); + if (flavor & PSIG_VFORK) { action = SIG_DFL; act_set_astbsd(sig_thread); @@ -1881,11 +1895,11 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) /* If successful return with ast set */ kret = get_signalthread(sig_proc, signum, &sig_thread); } + if (kret != KERN_SUCCESS) { -#if SIGNAL_DEBUG - ram_printf(1); -#endif /* SIGNAL_DEBUG */ - goto psigout; + DTRACE_PROC3(signal__discard, thread_t, sig_thread, proc_t, sig_proc, int, signum); + proc_unlock(sig_proc); + goto sigout_unlocked; } uth = get_bsdthread_info(sig_thread); @@ -1906,7 +1920,8 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) * action will be SIG_DFL here.) */ if (sig_proc->p_sigignore & mask) - goto psigout; + goto sigout_locked; + if (uth->uu_sigwait & mask) action = KERN_SIG_WAIT; else if (uth->uu_sigmask & mask) @@ -1918,8 +1933,7 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) } } - proc_lock(sig_proc); - + /* TODO: p_nice isn't hooked up to the scheduler... */ if (sig_proc->p_nice > NZERO && action == SIG_DFL && (prop & SA_KILL) && (sig_proc->p_lflag & P_LTRACED) == 0) sig_proc->p_nice = NZERO; @@ -1935,41 +1949,33 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ - proc_unlock(sig_proc); pg = proc_pgrp(sig_proc); if (prop & SA_TTYSTOP && pg->pg_jobc == 0 && action == SIG_DFL) { pg_rele(pg); - goto psigout; + goto sigout_locked; } pg_rele(pg); - proc_lock(sig_proc); uth->uu_siglist &= ~contsigmask; } uth->uu_siglist |= mask; - /* - * Repost AST incase sigthread has processed - * ast and missed signal post. - */ - if (action == KERN_SIG_CATCH) - act_set_astbsd(sig_thread); - /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. 
*/ /* vfork will not go thru as action is SIG_DFL */ - if ((action == KERN_SIG_HOLD) && ((prop & SA_CONT) == 0 || sig_proc->p_stat != SSTOP)) { - proc_unlock(sig_proc); - goto psigout; - } + if ((action == KERN_SIG_HOLD) && ((prop & SA_CONT) == 0 || sig_proc->p_stat != SSTOP)) + goto sigout_locked; + /* * SIGKILL priority twiddling moved here from above because * it needs sig_thread. Could merge it into large switch * below if we didn't care about priority for tracing * as SIGKILL's action is always SIG_DFL. + * + * TODO: p_nice isn't hooked up to the scheduler... */ if ((signum == SIGKILL) && (sig_proc->p_nice > NZERO)) { sig_proc->p_nice = NZERO; @@ -1983,11 +1989,10 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) if (sig_proc->p_lflag & P_LTRACED) { if (sig_proc->p_stat != SSTOP) goto runlocked; - else { - proc_unlock(sig_proc); - goto psigout; - } + else + goto sigout_locked; } + if ((flavor & PSIG_VFORK) != 0) goto runlocked; @@ -2013,13 +2018,9 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) if (prop & SA_CONT) { OSBitOrAtomic(P_CONTINUED, &sig_proc->p_flag); sig_proc->p_contproc = current_proc()->p_pid; - - proc_unlock(sig_proc); (void) task_resume_internal(sig_task); - goto psigout; } - proc_unlock(sig_proc); - goto psigout; + goto sigout_locked; } if (action != SIG_DFL) { @@ -2030,13 +2031,10 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) */ if (prop & SA_CONT) { OSBitOrAtomic(P_CONTINUED, &sig_proc->p_flag); - proc_unlock(sig_proc); (void) task_resume_internal(sig_task); - proc_lock(sig_proc); sig_proc->p_stat = SRUN; } else if (sig_proc->p_stat == SSTOP) { - proc_unlock(sig_proc); - goto psigout; + goto sigout_locked; } /* * Fill out siginfo structure information to pass to the @@ -2051,9 +2049,7 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) * Note: Avoid the SIGCHLD recursion case! 
*/ if (signum != SIGCHLD) { - proc_unlock(sig_proc); r_uid = kauth_getruid(); - proc_lock(sig_proc); sig_proc->si_pid = current_proc()->p_pid; sig_proc->si_status = W_EXITCODE(signum, 0); @@ -2073,14 +2069,13 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) * stopped from the keyboard. */ if (!(prop & SA_STOP) && sig_proc->p_pptr == initproc) { - proc_unlock(sig_proc); - psignal_locked(sig_proc, SIGKILL); - proc_lock(sig_proc); uth->uu_siglist &= ~mask; proc_unlock(sig_proc); - goto psigout; + /* siglock still locked, proc_lock not locked */ + psignal_locked(sig_proc, SIGKILL); + goto sigout_unlocked; } - + /* * Stop the task * if task hasn't already been stopped by @@ -2119,19 +2114,18 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) psignal(pp, SIGCHLD); } - if (pp != PROC_NULL) + if (pp != PROC_NULL) { proc_parentdropref(pp, 0); - } else - proc_unlock(sig_proc); - goto psigout; + } + + goto sigout_unlocked; + } + + goto sigout_locked; } DTRACE_PROC3(signal__send, thread_t, sig_thread, proc_t, p, int, signum); - /* - * enters switch with sig_proc lock held but dropped when - * gets out of switch - */ switch (signum) { /* * Signals ignored by default have been dealt @@ -2157,9 +2151,8 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) */ act_set_astbsd(sig_thread); thread_abort(sig_thread); - proc_unlock(sig_proc); - goto psigout; + goto sigout_locked; case SIGCONT: /* @@ -2169,9 +2162,8 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) OSBitOrAtomic(P_CONTINUED, &sig_proc->p_flag); sig_proc->p_contproc = sig_proc->p_pid; - proc_unlock(sig_proc); (void) task_resume_internal(sig_task); - proc_lock(sig_proc); + /* * When processing a SIGCONT, we need to check * to see if there are signals pending that @@ -2190,8 +2182,7 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) uth->uu_siglist &= ~mask; 
sig_proc->p_stat = SRUN; - proc_unlock(sig_proc); - goto psigout; + goto sigout_locked; default: /* @@ -2201,9 +2192,8 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) */ if (((flavor & (PSIG_VFORK|PSIG_THREAD)) == 0) && (action == SIG_DFL) && (prop & SA_KILL)) { sig_proc->p_stat = SRUN; - proc_unlock(sig_proc); thread_abort(sig_thread); - goto psigout; + goto sigout_locked; } /* @@ -2211,8 +2201,7 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) * resume it. */ if (sig_proc->p_stat == SSTOP) { - proc_unlock(sig_proc); - goto psigout; + goto sigout_locked; } goto runlocked; } @@ -2226,22 +2215,25 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) */ if (sig_proc->p_stat == SSTOP) { if ((sig_proc->p_lflag & P_LTRACED) != 0 && sig_proc->p_xstat != 0) - uth->uu_siglist |= sigmask(sig_proc->p_xstat); + uth->uu_siglist |= sigmask(sig_proc->p_xstat); + if ((flavor & PSIG_VFORK) != 0) { sig_proc->p_stat = SRUN; } - proc_unlock(sig_proc); } else { /* * setrunnable(p) in BSD and * Wake up the thread if it is interruptible. */ sig_proc->p_stat = SRUN; - proc_unlock(sig_proc); if ((flavor & PSIG_VFORK) == 0) thread_abort_safely(sig_thread); } -psigout: + +sigout_locked: + proc_unlock(sig_proc); + +sigout_unlocked: if ((flavor & PSIG_LOCKED)== 0) { proc_signalend(sig_proc, 0); } diff --git a/bsd/kern/kern_time.c b/bsd/kern/kern_time.c index cfbd3c99f..5da44690a 100644 --- a/bsd/kern/kern_time.c +++ b/bsd/kern/kern_time.c @@ -546,6 +546,18 @@ itimerfix( return (0); } +int +timespec_is_valid(const struct timespec *ts) +{ + /* The INT32_MAX limit ensures the timespec is safe for clock_*() functions + * which accept 32-bit ints. 
*/ + if (ts->tv_sec < 0 || ts->tv_sec > INT32_MAX || + ts->tv_nsec < 0 || (unsigned long long)ts->tv_nsec > NSEC_PER_SEC) { + return 0; + } + return 1; +} + /* * Decrement an interval timer by a specified number * of microseconds, which must be less than a second, @@ -723,6 +735,15 @@ tvtoabstime( return (result + usresult); } +uint64_t +tstoabstime(struct timespec *ts) +{ + uint64_t abstime_s, abstime_ns; + clock_interval_to_absolutetime_interval(ts->tv_sec, NSEC_PER_SEC, &abstime_s); + clock_interval_to_absolutetime_interval(ts->tv_nsec, 1, &abstime_ns); + return abstime_s + abstime_ns; +} + #if NETWORKING /* * ratecheck(): simple time-based rate-limit checking. diff --git a/bsd/kern/proc_info.c b/bsd/kern/proc_info.c index a5da30245..e26393c52 100644 --- a/bsd/kern/proc_info.c +++ b/bsd/kern/proc_info.c @@ -139,6 +139,7 @@ int __attribute__ ((noinline)) proc_terminate(int pid, int32_t * retval); int __attribute__ ((noinline)) proc_pid_rusage(int pid, int flavor, user_addr_t buffer, int32_t * retval); int __attribute__ ((noinline)) proc_pidoriginatorinfo(int pid, int flavor, user_addr_t buffer, uint32_t buffersize, int32_t * retval); int __attribute__ ((noinline)) proc_listcoalitions(int flavor, int coaltype, user_addr_t buffer, uint32_t buffersize, int32_t *retval); +int __attribute__ ((noinline)) proc_can_use_foreground_hw(int pid, user_addr_t reason, uint32_t resonsize, int32_t *retval); /* protos for procpidinfo calls */ int __attribute__ ((noinline)) proc_pidfdlist(proc_t p, user_addr_t buffer, uint32_t buffersize, int32_t *retval); @@ -246,6 +247,8 @@ proc_info_internal(int callnum, int pid, int flavor, uint64_t arg, user_addr_t b case PROC_INFO_CALL_LISTCOALITIONS: return proc_listcoalitions(pid /* flavor */, flavor /* coaltype */, buffer, buffersize, retval); + case PROC_INFO_CALL_CANUSEFGHW: + return proc_can_use_foreground_hw(pid, buffer, buffersize, retval); default: return(EINVAL); } @@ -1375,6 +1378,174 @@ int proc_listcoalitions(int flavor, int 
type, user_addr_t buffer, } +/*************************** proc_can_use_foreground_hw **************************/ +int proc_can_use_foreground_hw(int pid, user_addr_t u_reason, uint32_t reasonsize, int32_t *retval) +{ + proc_t p = PROC_NULL; + int error = 0; + uint32_t reason = PROC_FGHW_ERROR; + uint32_t isBG = 0; + task_t task = TASK_NULL; +#if CONFIG_COALITIONS + coalition_t coal = COALITION_NULL; +#endif + + *retval = 0; + + if (pid <= 0) { + error = EINVAL; + reason = PROC_FGHW_ERROR; + goto out; + } + + p = proc_find(pid); + if (p == PROC_NULL) { + error = ESRCH; + reason = PROC_FGHW_ERROR; + goto out; + } + +#if CONFIG_COALITIONS + if (p != current_proc() && + !kauth_cred_issuser(kauth_cred_get())) { + error = EPERM; + reason = PROC_FGHW_ERROR; + goto out; + } + + task = p->task; + task_reference(task); + if (coalition_is_leader(task, COALITION_TYPE_JETSAM, &coal) == FALSE) { + /* current task is not a coalition leader: find the leader */ + task_deallocate(task); + task = coalition_get_leader(coal); + } + + if (task != TASK_NULL) { + /* + * If task is non-null, then it is the coalition leader of the + * current process' coalition. This could be the same task as + * the current_task, and that's OK. + */ + uint32_t flags = 0; + int role; + + proc_get_darwinbgstate(task, &flags); + if ((flags & PROC_FLAG_APPLICATION) != PROC_FLAG_APPLICATION) { + /* + * Coalition leader is not an application, continue + * searching for other ways this task could gain + * access to HW + */ + reason = PROC_FGHW_DAEMON_LEADER; + goto no_leader; + } + + if (proc_get_effective_task_policy(task, TASK_POLICY_DARWIN_BG)) { + /* + * If the leader of the current process' coalition has + * been marked as DARWIN_BG, then it definitely should + * not be using foreground hardware resources. 
+ */ + reason = PROC_FGHW_LEADER_BACKGROUND; + goto out; + } + + role = proc_get_effective_task_policy(task, TASK_POLICY_ROLE); + switch (role) { + case TASK_FOREGROUND_APPLICATION: /* DARWIN_ROLE_UI_FOCAL */ + case TASK_BACKGROUND_APPLICATION: /* DARWIN_ROLE_UI */ + /* + * The leader of this coalition is a focal, UI app: + * access granted + * TODO: should extensions/plugins be allowed to use + * this hardware? + */ + *retval = 1; + reason = PROC_FGHW_OK; + goto out; + case TASK_DEFAULT_APPLICATION: /* DARWIN_ROLE_UI_NON_FOCAL */ + case TASK_NONUI_APPLICATION: /* DARWIN_ROLE_NON_UI */ + case TASK_THROTTLE_APPLICATION: + case TASK_UNSPECIFIED: + default: + /* non-focal, non-ui apps don't get access */ + reason = PROC_FGHW_LEADER_NONUI; + goto out; + } + } + +no_leader: + if (task != TASK_NULL) { + task_deallocate(task); + task = TASK_NULL; + } +#endif /* CONFIG_COALITIONS */ + + /* + * There is no reasonable semantic to investigate the currently + * adopted voucher of an arbitrary thread in a non-current process. + * We return '0' + */ + if (p != current_proc()) { + error = EINVAL; + goto out; + } + + /* + * In the absence of coalitions, fall back to a voucher-based lookup + * where a daemon can use foreground HW if it's operating on behalf + * of a foreground application. 
+ * NOTE: this is equivalent to a call to + * proc_pidoriginatorinfo(PROC_PIDORIGINATOR_BGSTATE, &isBG, sizeof(isBG)) + */ + isBG = 1; + error = proc_get_originatorbgstate(&isBG); + switch (error) { + case 0: + break; + case ESRCH: + reason = PROC_FGHW_NO_ORIGINATOR; + error = 0; + goto out; + case ENOATTR: + reason = PROC_FGHW_NO_VOUCHER_ATTR; + error = 0; + goto out; + case EINVAL: + reason = PROC_FGHW_DAEMON_NO_VOUCHER; + error = 0; + goto out; + default: + /* some other error occurred: report that to the caller */ + reason = PROC_FGHW_VOUCHER_ERROR; + goto out; + } + + if (isBG) { + reason = PROC_FGHW_ORIGINATOR_BACKGROUND; + error = 0; + } else { + /* + * The process itself is either a foreground app, or has + * adopted a voucher originating from an app that's still in + * the foreground + */ + reason = PROC_FGHW_DAEMON_OK; + *retval = 1; + } + +out: + if (task != TASK_NULL) + task_deallocate(task); + if (p != PROC_NULL) + proc_rele(p); + if (reasonsize >= sizeof(reason) && u_reason != (user_addr_t)0) + (void)copyout(&reason, u_reason, sizeof(reason)); + return error; +} + + /********************************** proc_pidinfo ********************************/ diff --git a/bsd/kern/sys_coalition.c b/bsd/kern/sys_coalition.c index a20ce301f..e35a8a878 100644 --- a/bsd/kern/sys_coalition.c +++ b/bsd/kern/sys_coalition.c @@ -216,10 +216,6 @@ coalition_info_resource_usage(coalition_t coal, user_addr_t buffer, user_size_t kern_return_t kr; struct coalition_resource_usage cru; - if (bufsize != sizeof(cru)) { - return EINVAL; - } - kr = coalition_resource_usage_internal(coal, &cru); switch (kr) { @@ -233,7 +229,7 @@ coalition_info_resource_usage(coalition_t coal, user_addr_t buffer, user_size_t return EIO; /* shrug */ } - return copyout(&cru, buffer, bufsize); + return copyout(&cru, buffer, MIN(bufsize, sizeof(cru))); } int coalition_info(proc_t p, struct coalition_info_args *uap, __unused int32_t *retval) diff --git a/bsd/kern/sys_generic.c b/bsd/kern/sys_generic.c 
index d6c46f58d..1fb49eb91 100644 --- a/bsd/kern/sys_generic.c +++ b/bsd/kern/sys_generic.c @@ -940,6 +940,7 @@ static int selscan(struct proc *p, struct _select * sel, struct _select_data * s static int selcount(struct proc *p, u_int32_t *ibits, int nfd, int *count); static int seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount); static int seldrop(struct proc *p, u_int32_t *ibits, int nfd); +static int select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval); /* * Select system call. @@ -947,17 +948,126 @@ static int seldrop(struct proc *p, u_int32_t *ibits, int nfd); * Returns: 0 Success * EINVAL Invalid argument * EAGAIN Nonconformant error if allocation fails - * selprocess:??? */ int select(struct proc *p, struct select_args *uap, int32_t *retval) { __pthread_testcancel(1); - return(select_nocancel(p, (struct select_nocancel_args *)uap, retval)); + return select_nocancel(p, (struct select_nocancel_args *)uap, retval); } int select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retval) +{ + uint64_t timeout = 0; + + if (uap->tv) { + int err; + struct timeval atv; + if (IS_64BIT_PROCESS(p)) { + struct user64_timeval atv64; + err = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64)); + /* Loses resolution - assume timeout < 68 years */ + atv.tv_sec = atv64.tv_sec; + atv.tv_usec = atv64.tv_usec; + } else { + struct user32_timeval atv32; + err = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32)); + atv.tv_sec = atv32.tv_sec; + atv.tv_usec = atv32.tv_usec; + } + if (err) + return err; + + if (itimerfix(&atv)) { + err = EINVAL; + return err; + } + + clock_absolutetime_interval_to_deadline(tvtoabstime(&atv), &timeout); + } + + return select_internal(p, uap, timeout, retval); +} + +int +pselect(struct proc *p, struct pselect_args *uap, int32_t *retval) +{ + __pthread_testcancel(1); + return pselect_nocancel(p, (struct pselect_nocancel_args *)uap, retval); +} + 
+int +pselect_nocancel(struct proc *p, struct pselect_nocancel_args *uap, int32_t *retval) +{ + int err; + struct uthread *ut; + uint64_t timeout = 0; + + if (uap->ts) { + struct timespec ts; + + if (IS_64BIT_PROCESS(p)) { + struct user64_timespec ts64; + err = copyin(uap->ts, (caddr_t)&ts64, sizeof(ts64)); + ts.tv_sec = ts64.tv_sec; + ts.tv_nsec = ts64.tv_nsec; + } else { + struct user32_timespec ts32; + err = copyin(uap->ts, (caddr_t)&ts32, sizeof(ts32)); + ts.tv_sec = ts32.tv_sec; + ts.tv_nsec = ts32.tv_nsec; + } + if (err) { + return err; + } + + if (!timespec_is_valid(&ts)) { + return EINVAL; + } + clock_absolutetime_interval_to_deadline(tstoabstime(&ts), &timeout); + } + + ut = get_bsdthread_info(current_thread()); + + if (uap->mask != USER_ADDR_NULL) { + /* save current mask, then copyin and set new mask */ + sigset_t newset; + err = copyin(uap->mask, &newset, sizeof(sigset_t)); + if (err) { + return err; + } + ut->uu_oldmask = ut->uu_sigmask; + ut->uu_flag |= UT_SAS_OLDMASK; + ut->uu_sigmask = (newset & ~sigcantmask); + } + + err = select_internal(p, (struct select_nocancel_args *)uap, timeout, retval); + + if (err != EINTR && ut->uu_flag & UT_SAS_OLDMASK) { + /* + * Restore old mask (direct return case). NOTE: EINTR can also be returned + * if the thread is cancelled. In that case, we don't reset the signal + * mask to its original value (which usually happens in the signal + * delivery path). This behavior is permitted by POSIX. + */ + ut->uu_sigmask = ut->uu_oldmask; + ut->uu_oldmask = 0; + ut->uu_flag &= ~UT_SAS_OLDMASK; + } + + return err; +} + +/* + * Generic implementation of {,p}select. Care: we type-pun uap across the two + * syscalls, which differ slightly. The first 4 arguments (nfds and the fd sets) + * are identical. The 5th (timeout) argument points to different types, so we + * unpack in the syscall-specific code, but the generic code still does a null + * check on this argument to determine if a timeout was specified. 
+ */ +static int +select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval) { int error = 0; u_int ni, nw; @@ -1049,32 +1159,7 @@ select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retva getbits(ex, 2); #undef getbits - if (uap->tv) { - struct timeval atv; - if (IS_64BIT_PROCESS(p)) { - struct user64_timeval atv64; - error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64)); - /* Loses resolution - assume timeout < 68 years */ - atv.tv_sec = atv64.tv_sec; - atv.tv_usec = atv64.tv_usec; - } else { - struct user32_timeval atv32; - error = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32)); - atv.tv_sec = atv32.tv_sec; - atv.tv_usec = atv32.tv_usec; - } - if (error) - goto continuation; - if (itimerfix(&atv)) { - error = EINVAL; - goto continuation; - } - - clock_absolutetime_interval_to_deadline( - tvtoabstime(&atv), &seldata->abstime); - } - else - seldata->abstime = 0; + seldata->abstime = timeout; if ( (error = selcount(p, sel->ibits, uap->nd, &count)) ) { goto continuation; @@ -1306,6 +1391,14 @@ selprocess(int error, int sel_pass) putbits(ex, 2); #undef putbits } + + if (error != EINTR && sel_pass == SEL_SECONDPASS && uth->uu_flag & UT_SAS_OLDMASK) { + /* restore signal mask - continuation case */ + uth->uu_sigmask = uth->uu_oldmask; + uth->uu_oldmask = 0; + uth->uu_flag &= ~UT_SAS_OLDMASK; + } + return(error); } diff --git a/bsd/kern/syscalls.master b/bsd/kern/syscalls.master index 5f58f9263..066065c6a 100644 --- a/bsd/kern/syscalls.master +++ b/bsd/kern/syscalls.master @@ -600,8 +600,8 @@ 391 AUE_NULL ALL { int enosys(void); } 392 AUE_NULL ALL { int enosys(void); } 393 AUE_NULL ALL { int enosys(void); } -394 AUE_NULL ALL { int enosys(void); } -395 AUE_NULL ALL { int enosys(void); } +394 AUE_SELECT ALL { int pselect(int nd, u_int32_t *in, u_int32_t *ou, u_int32_t *ex, const struct timespec *ts, const struct sigset_t *mask) NO_SYSCALL_STUB; } +395 AUE_SELECT ALL { int pselect_nocancel(int nd, 
u_int32_t *in, u_int32_t *ou, u_int32_t *ex, const struct timespec *ts, const struct sigset_t *mask) NO_SYSCALL_STUB; } 396 AUE_NULL ALL { user_ssize_t read_nocancel(int fd, user_addr_t cbuf, user_size_t nbyte) NO_SYSCALL_STUB; } 397 AUE_NULL ALL { user_ssize_t write_nocancel(int fd, user_addr_t cbuf, user_size_t nbyte) NO_SYSCALL_STUB; } 398 AUE_OPEN_RWTC ALL { int open_nocancel(user_addr_t path, int flags, int mode) NO_SYSCALL_STUB; } diff --git a/bsd/kern/trace.codes b/bsd/kern/trace.codes index 2a5a36206..b37035dfe 100644 --- a/bsd/kern/trace.codes +++ b/bsd/kern/trace.codes @@ -223,6 +223,7 @@ 0x130048C MACH_vm_page_sleep 0x1300490 MACH_vm_page_expedite 0x13004c0 MACH_vm_pressure_event +0x1300500 MACH_vm_data_write 0x1400000 MACH_SCHED 0x1400004 MACH_STKATTACH 0x1400008 MACH_STKHANDOFF @@ -332,6 +333,7 @@ 0x1a30004 ENERGY_PERF_GPU_DESCRIPTION 0x1a30008 ENERGY_PERF_GPU_TIME 0x1a40000 SYSDIAGNOSE_notify_user +0x1a50000 ZALLOC_ZCRAM 0x2010000 L_IP_In_Beg 0x2010004 L_IP_Out_Beg 0x2010008 L_IP_In_End diff --git a/bsd/kern/ubc_subr.c b/bsd/kern/ubc_subr.c index 0222981a2..7d7fbe7c6 100644 --- a/bsd/kern/ubc_subr.c +++ b/bsd/kern/ubc_subr.c @@ -2847,13 +2847,12 @@ ubc_cs_blob_add( error = cs_validate_csblob((const uint8_t *)addr, size, &cd); if (error) { - if (cs_debug) + + if (cs_debug) printf("CODESIGNING: csblob invalid: %d\n", error); - blob->csb_flags = 0; - blob->csb_start_offset = 0; - blob->csb_end_offset = 0; - memset(blob->csb_cdhash, 0, sizeof(blob->csb_cdhash)); - /* let the vnode checker determine if the signature is valid or not */ + /* The vnode checker can't make the rest of this function succeed if csblob validation failed, so bail */ + goto out; + } else { const unsigned char *md_base; uint8_t hash[CS_HASH_MAX_SIZE]; diff --git a/bsd/kern/uipc_socket.c b/bsd/kern/uipc_socket.c index d73d61a4b..ede272ba6 100644 --- a/bsd/kern/uipc_socket.c +++ b/bsd/kern/uipc_socket.c @@ -6804,9 +6804,14 @@ sockaddrentry_dup(const struct sockaddr_entry *src_se, int 
how) dst_se = sockaddrentry_alloc(how); if (dst_se != NULL) { int len = src_se->se_addr->sa_len; + /* + * Workaround for rdar://23362120 + * Always allocate a buffer that can hold an IPv6 socket address + */ + size_t alloclen = MAX(len, sizeof(struct sockaddr_in6)); MALLOC(dst_se->se_addr, struct sockaddr *, - len, M_SONAME, how | M_ZERO); + alloclen, M_SONAME, how | M_ZERO); if (dst_se->se_addr != NULL) { bcopy(src_se->se_addr, dst_se->se_addr, len); } else { diff --git a/bsd/kern/uipc_syscalls.c b/bsd/kern/uipc_syscalls.c index f44291282..03330fbcb 100644 --- a/bsd/kern/uipc_syscalls.c +++ b/bsd/kern/uipc_syscalls.c @@ -2756,6 +2756,7 @@ getsockaddr(struct socket *so, struct sockaddr **namp, user_addr_t uaddr, { struct sockaddr *sa; int error; + size_t alloclen; if (len > SOCK_MAXADDRLEN) return (ENAMETOOLONG); @@ -2763,7 +2764,12 @@ getsockaddr(struct socket *so, struct sockaddr **namp, user_addr_t uaddr, if (len < offsetof(struct sockaddr, sa_data[0])) return (EINVAL); - MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK | M_ZERO); + /* + * Workaround for rdar://23362120 + * Always allocate a buffer that can hold an IPv6 socket address + */ + alloclen = MAX(len, sizeof(struct sockaddr_in6)); + MALLOC(sa, struct sockaddr *, alloclen, M_SONAME, M_WAITOK | M_ZERO); if (sa == NULL) { return (ENOMEM); } diff --git a/bsd/man/man2/Makefile b/bsd/man/man2/Makefile index 075ebeb16..047f85de3 100644 --- a/bsd/man/man2/Makefile +++ b/bsd/man/man2/Makefile @@ -140,6 +140,7 @@ DATAFILES = \ posix_madvise.2 \ pread.2 \ profil.2 \ + pselect.2 \ pthread_setugid_np.2 \ ptrace.2 \ pwrite.2 \ diff --git a/bsd/man/man2/pselect.2 b/bsd/man/man2/pselect.2 new file mode 100644 index 000000000..eaf91d030 --- /dev/null +++ b/bsd/man/man2/pselect.2 @@ -0,0 +1,122 @@ +.\" +.\" Copyright 2002 Massachusetts Institute of Technology +.\" +.\" Permission to use, copy, modify, and distribute this software and +.\" its documentation for any purpose and without fee is hereby +.\" granted, 
provided that both the above copyright notice and this +.\" permission notice appear in all copies, that both the above +.\" copyright notice and this permission notice appear in all +.\" supporting documentation, and that the name of M.I.T. not be used +.\" in advertising or publicity pertaining to distribution of the +.\" software without specific, written prior permission. M.I.T. makes +.\" no representations about the suitability of this software for any +.\" purpose. It is provided "as is" without express or implied +.\" warranty. +.\" +.\" THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS +.\" ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, +.\" INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +.\" MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT +.\" SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +.\" SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +.\" LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +.\" USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +.\" ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +.\" OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +.\" OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD: src/lib/libc/gen/pselect.3,v 1.4 2002/12/18 10:13:54 ru Exp $ +.\" +.Dd June 16, 2002 +.Dt PSELECT 2 +.Os +.Sh NAME +.Nm pselect +.Nd synchronous I/O multiplexing a la POSIX.1g +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In sys/select.h +.Ft int +.Fo pselect +.Fa "int nfds" +.Fa "fd_set *restrict readfds" +.Fa "fd_set *restrict writefds" +.Fa "fd_set *restrict errorfds" +.Fa "const struct timespec *restrict timeout" +.Fa "const sigset_t *restrict sigmask" +.Fc +.Sh DESCRIPTION +The +.Fn pselect +function was introduced by +.St -p1003.1g-2000 +as a slightly stronger version of +.Xr select 2 . 
+The +.Fa nfds , readfds , writefds , +and +.Fa errorfds +arguments are all identical to the analogous arguments of +.Fn select . +The +.Fa timeout +argument in +.Fn pselect +points to a +.Vt "const struct timespec" , +rather than the (modifiable) +.Vt "struct timeval" +used by +.Fn select ; +as in +.Fn select , +a null pointer may be passed to indicate that +.Fn pselect +should wait indefinitely. +Finally, +.Fa sigmask +specifies a signal mask which is set while waiting for input. +When +.Fn pselect +returns, the original signal mask is restored. +.Pp +See +.Xr select 2 +for a more detailed discussion of the semantics of this interface, and +for macros used to manipulate the +.Vt "fd_set" +data type. +.Sh RETURN VALUES +The +.Fn pselect +function returns the same values and under the same conditions as +.Fn select . +.Sh ERRORS +The +.Fn pselect +function may fail for any of the reasons documented for +.Xr select 2 +and (if a signal mask is provided) +.Xr sigprocmask 2 . +.Sh SEE ALSO +.Xr kqueue 2 , +.Xr poll 2 , +.Xr select 2 , +.Xr sigprocmask 2 +.Sh STANDARDS +The +.Fn pselect +function conforms to +.St -p1003.1-2001 . +.Sh HISTORY +The +.Fn pselect +function first appeared in +.Fx 5.0 . +.Sh AUTHORS +The +.Fn pselect +manual page was written by +.An Garrett Wollman Aq wollman@FreeBSD.org . 
diff --git a/bsd/net/classq/classq_sfb.c b/bsd/net/classq/classq_sfb.c index 7d12ba606..5831f968f 100644 --- a/bsd/net/classq/classq_sfb.c +++ b/bsd/net/classq/classq_sfb.c @@ -1251,6 +1251,7 @@ sfb_addq(struct sfb *sp, class_queue_t *q, struct mbuf *m, struct pf_mtag *t) */ if (droptype == DTYPE_NODROP && qlen(q) >= maxqsize) { if (pkt->pkt_proto == IPPROTO_TCP && + qlen(q) < (maxqsize + (maxqsize >> 1)) && ((pkt->pkt_flags & PKTF_TCP_REXMT) || (sp->sfb_flags & SFBF_LAST_PKT_DROPPED))) { /* diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c index 5576af7d7..6d89331b1 100644 --- a/bsd/net/dlil.c +++ b/bsd/net/dlil.c @@ -1095,6 +1095,23 @@ dlil_alloc_local_stats(struct ifnet *ifp) ret = 0; } + if (ifp->if_ipv4_stat == NULL) { + MALLOC(ifp->if_ipv4_stat, struct if_tcp_ecn_stat *, + sizeof(struct if_tcp_ecn_stat), M_TEMP, M_WAITOK|M_ZERO); + if (ifp->if_ipv4_stat == NULL) { + ret = ENOMEM; + goto end; + } + } + + if (ifp->if_ipv6_stat == NULL) { + MALLOC(ifp->if_ipv6_stat, struct if_tcp_ecn_stat *, + sizeof(struct if_tcp_ecn_stat), M_TEMP, M_WAITOK|M_ZERO); + if (ifp->if_ipv6_stat == NULL) { + ret = ENOMEM; + goto end; + } + } end: if (ret != 0) { if (ifp->if_tcp_stat != NULL) { @@ -1109,6 +1126,14 @@ dlil_alloc_local_stats(struct ifnet *ifp) zfree(dlif_udpstat_zone, *pbuf); ifp->if_udp_stat = NULL; } + if (ifp->if_ipv4_stat != NULL) { + FREE(ifp->if_ipv4_stat, M_TEMP); + ifp->if_ipv4_stat = NULL; + } + if (ifp->if_ipv6_stat != NULL) { + FREE(ifp->if_ipv6_stat, M_TEMP); + ifp->if_ipv6_stat = NULL; + } } return (ret); @@ -5093,6 +5118,7 @@ ifproto_media_send_arp(struct ifnet *ifp, u_short arpop, } extern int if_next_index(void); +extern int tcp_ecn_outbound; errno_t ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) @@ -5414,6 +5440,16 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) } else { ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN; } + + /* + * Enable ECN capability on this interface depending on the + * value of ECN global 
setting + */ + if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) { + ifp->if_eflags |= IFEF_ECN_ENABLE; + ifp->if_eflags &= ~IFEF_ECN_DISABLE; + } + ifnet_lock_done(ifp); ifnet_head_done(); @@ -5711,6 +5747,14 @@ ifnet_detach(ifnet_t ifp) if (ifp->if_udp_stat != NULL) bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat)); + /* Reset ifnet IPv4 stats */ + if (ifp->if_ipv4_stat != NULL) + bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat)); + + /* Reset ifnet IPv6 stats */ + if (ifp->if_ipv6_stat != NULL) + bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat)); + /* Release memory held for interface link status report */ if (ifp->if_link_status != NULL) { FREE(ifp->if_link_status, M_TEMP); diff --git a/bsd/net/if.c b/bsd/net/if.c index d65efe3a1..57d48d21c 100644 --- a/bsd/net/if.c +++ b/bsd/net/if.c @@ -1992,7 +1992,9 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case SIOCGIFINTERFACESTATE: /* struct ifreq */ case SIOCSIFPROBECONNECTIVITY: /* struct ifreq */ case SIOCGIFPROBECONNECTIVITY: /* struct ifreq */ - case SIOCGSTARTDELAY: { /* struct ifreq */ + case SIOCGSTARTDELAY: /* struct ifreq */ + case SIOCGECNMODE: /* struct ifreq */ + case SIOCSECNMODE: { /* struct ifreq */ struct ifreq ifr; bcopy(data, &ifr, sizeof (ifr)); ifr.ifr_name[IFNAMSIZ - 1] = '\0'; @@ -2780,6 +2782,28 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) else ifr->ifr_probe_connectivity = 0; break; + case SIOCGECNMODE: + if ((ifp->if_eflags & (IFEF_ECN_ENABLE|IFEF_ECN_DISABLE)) == + IFEF_ECN_ENABLE) + ifr->ifr_ecn_mode = IFRTYPE_ECN_ENABLE; + else if ((ifp->if_eflags & (IFEF_ECN_ENABLE|IFEF_ECN_DISABLE)) == + IFEF_ECN_DISABLE) + ifr->ifr_ecn_mode = IFRTYPE_ECN_DISABLE; + else + ifr->ifr_ecn_mode = IFRTYPE_ECN_DEFAULT; + break; + case SIOCSECNMODE: + if (ifr->ifr_ecn_mode == IFRTYPE_ECN_DEFAULT) { + ifp->if_eflags &= ~(IFEF_ECN_ENABLE|IFEF_ECN_DISABLE); + } else if (ifr->ifr_ecn_mode == IFRTYPE_ECN_ENABLE) { + ifp->if_eflags |= 
IFEF_ECN_ENABLE; + ifp->if_eflags &= ~IFEF_ECN_DISABLE; + } else if (ifr->ifr_ecn_mode == IFRTYPE_ECN_DISABLE) { + ifp->if_eflags |= IFEF_ECN_DISABLE; + ifp->if_eflags &= ~IFEF_ECN_ENABLE; + } else + error = EINVAL; + break; default: VERIFY(0); /* NOTREACHED */ @@ -4303,6 +4327,10 @@ ifioctl_cassert(void) case SIOCGIFAGENTDATA64: case SIOCSIFINTERFACESTATE: case SIOCGIFINTERFACESTATE: + case SIOCSIFPROBECONNECTIVITY: + case SIOCGIFPROBECONNECTIVITY: + case SIOCGECNMODE: + case SIOCSECNMODE: ; } } diff --git a/bsd/net/if.h b/bsd/net/if.h index 62afa9cd4..c2c99314b 100644 --- a/bsd/net/if.h +++ b/bsd/net/if.h @@ -182,6 +182,8 @@ struct if_clonereq32 { #define IFEF_NOACKPRI 0x00200000 /* No TCP ACK prioritization */ #define IFEF_AWDL_RESTRICTED 0x00400000 /* Restricted AWDL mode */ #define IFEF_2KCL 0x00800000 /* prefers 2K cluster (socket based tunnel) */ +#define IFEF_ECN_ENABLE 0x01000000 /* use ECN for TCP connections on the interface */ +#define IFEF_ECN_DISABLE 0x02000000 /* do not use ECN for TCP connections on the interface */ #define IFEF_SENDLIST 0x10000000 /* Supports tx packet lists */ #define IFEF_DIRECTLINK 0x20000000 /* point-to-point topology */ #define _IFEF_INUSE 0x40000000 /* deprecated */ @@ -486,6 +488,10 @@ struct ifreq { } ifru_start_delay; struct if_interface_state ifru_interface_state; u_int32_t ifru_probe_connectivity; + u_int32_t ifru_ecn_mode; +#define IFRTYPE_ECN_DEFAULT 0 +#define IFRTYPE_ECN_ENABLE 1 +#define IFRTYPE_ECN_DISABLE 2 #endif /* PRIVATE */ } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ @@ -528,6 +534,7 @@ struct ifreq { #define ifr_start_delay_timeout ifr_ifru.ifru_start_delay.timeout #define ifr_interface_state ifr_ifru.ifru_interface_state #define ifr_probe_connectivity ifr_ifru.ifru_probe_connectivity +#define ifr_ecn_mode ifr_ifru.ifru_ecn_mode #endif /* PRIVATE */ }; diff --git a/bsd/net/if_var.h b/bsd/net/if_var.h index 29e253bc4..fc7ce88fb 100644 --- a/bsd/net/if_var.h +++ b/bsd/net/if_var.h @@ -327,6 
+327,39 @@ struct if_rxpoll_stats { u_int64_t ifi_poll_interval_time; /* poll interval (nsec) */ }; +struct if_tcp_ecn_perf_stat { + u_int64_t rtt_avg; + u_int64_t rtt_var; + u_int64_t oo_percent; + u_int64_t sack_episodes; + u_int64_t reorder_percent; + u_int64_t rxmit_percent; + u_int64_t rxmit_drop; +}; + +struct if_tcp_ecn_stat { + u_int64_t timestamp; + u_int64_t ecn_client_setup; + u_int64_t ecn_server_setup; + u_int64_t ecn_client_success; + u_int64_t ecn_server_success; + u_int64_t ecn_peer_nosupport; + u_int64_t ecn_syn_lost; + u_int64_t ecn_synack_lost; + u_int64_t ecn_recv_ce; + u_int64_t ecn_recv_ece; + u_int64_t ecn_conn_recv_ce; + u_int64_t ecn_conn_recv_ece; + u_int64_t ecn_conn_plnoce; + u_int64_t ecn_conn_plce; + u_int64_t ecn_conn_noplce; + u_int64_t ecn_fallback_synloss; + u_int64_t ecn_fallback_reorder; + u_int64_t ecn_fallback_ce; + struct if_tcp_ecn_perf_stat ecn_on; + struct if_tcp_ecn_perf_stat ecn_off; +}; + /* * Interface link status report -- includes statistics related to * the link layer technology sent by the driver. The driver will monitor @@ -908,6 +941,8 @@ struct ifnet { decl_lck_rw_data(, if_link_status_lock); struct if_link_status *if_link_status; struct if_interface_state if_interface_state; + struct if_tcp_ecn_stat *if_ipv4_stat; + struct if_tcp_ecn_stat *if_ipv6_stat; }; #define IF_TCP_STATINC(_ifp, _s) do { \ @@ -1189,6 +1224,13 @@ struct ifmultiaddr { ((_ifp)->if_type == IFT_CELLULAR || \ (_ifp)->if_delegated.type == IFT_CELLULAR) +/* + * Indicate whether or not the immediate interface, or the interface delegated + * by it, is an ETHERNET interface. + */ +#define IFNET_IS_ETHERNET(_ifp) \ + ((_ifp)->if_family == IFNET_FAMILY_ETHERNET || \ + (_ifp)->if_delegated.family == IFNET_FAMILY_ETHERNET) /* * Indicate whether or not the immediate interface, or the interface delegated * by it, is a Wi-Fi interface (IFNET_SUBFAMILY_WIFI). 
Delegated interface diff --git a/bsd/net/necp.c b/bsd/net/necp.c index 650e809cd..e40268c30 100644 --- a/bsd/net/necp.c +++ b/bsd/net/necp.c @@ -731,6 +731,13 @@ necp_packet_get_tlv_at_offset(mbuf_t packet, int tlv_offset, u_int32_t buff_len, return (error); } + u_int32_t total_len = m_length2(packet, NULL); + if (total_len < (tlv_offset + sizeof(u_int8_t) + sizeof(length) + length)) { + NECPLOG(LOG_ERR, "Got a bad TLV, length (%u) + offset (%d) > total length (%u)", + length, (int)(tlv_offset + sizeof(u_int8_t) + sizeof(length)), total_len); + return (EINVAL); + } + if (value_size != NULL) { *value_size = length; } @@ -4857,7 +4864,10 @@ necp_match_policy(struct proc *p, struct necp_match_policy_args *uap, int32_t *r goto done; } // Copy parameters in - copyin(uap->parameters, parameters, uap->parameters_size); + error = copyin(uap->parameters, parameters, uap->parameters_size); + if (error) { + goto done; + } error = necp_application_find_policy_match_internal(parameters, uap->parameters_size, &returned_result); if (error) { @@ -4865,7 +4875,10 @@ necp_match_policy(struct proc *p, struct necp_match_policy_args *uap, int32_t *r } // Copy return value back - copyout(&returned_result, uap->returned_result, sizeof(struct necp_aggregate_result)); + error = copyout(&returned_result, uap->returned_result, sizeof(struct necp_aggregate_result)); + if (error) { + goto done; + } done: if (parameters != NULL) { FREE(parameters, M_NECP); diff --git a/bsd/net/ntstat.c b/bsd/net/ntstat.c index 8a295f887..9d42c7c6d 100644 --- a/bsd/net/ntstat.c +++ b/bsd/net/ntstat.c @@ -91,7 +91,6 @@ static struct nstat_stats nstat_stats; SYSCTL_STRUCT(_net_stats, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_LOCKED, &nstat_stats, nstat_stats, ""); - enum { NSTAT_FLAG_CLEANUP = (1 << 0), @@ -155,6 +154,7 @@ static void nstat_control_cleanup_source(nstat_control_state *state, nstat_src static bool nstat_control_reporting_allowed(nstat_control_state *state, nstat_src *src); static boolean_t
nstat_control_begin_query(nstat_control_state *state, const nstat_msg_hdr *hdrp); static u_int16_t nstat_control_end_query(nstat_control_state *state, nstat_src *last_src, boolean_t partial); +static void nstat_ifnet_report_ecn_stats(void); static u_int32_t nstat_udp_watchers = 0; static u_int32_t nstat_tcp_watchers = 0; @@ -2101,6 +2101,77 @@ nstat_ifnet_copy_link_status( lck_rw_done(&ifp->if_link_status_lock); } +static u_int64_t nstat_ifnet_last_report_time = 0; +extern int tcp_report_stats_interval; + +void +nstat_ifnet_report_ecn_stats(void) +{ + u_int64_t uptime, last_report_time; + struct nstat_sysinfo_data data; + struct nstat_sysinfo_ifnet_ecn_stats *st; + struct ifnet *ifp; + + uptime = net_uptime(); + + if ((int)(uptime - nstat_ifnet_last_report_time) < + tcp_report_stats_interval) + return; + + last_report_time = nstat_ifnet_last_report_time; + nstat_ifnet_last_report_time = uptime; + data.flags = NSTAT_SYSINFO_IFNET_ECN_STATS; + st = &data.u.ifnet_ecn_stats; + + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (ifp->if_ipv4_stat == NULL || ifp->if_ipv6_stat == NULL) + continue; + + if ((ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)) != + IFRF_ATTACHED) + continue; + + /* Limit reporting to Wifi, Ethernet and cellular. 
*/ + if (!(IFNET_IS_ETHERNET(ifp) || IFNET_IS_CELLULAR(ifp))) + continue; + + bzero(st, sizeof(*st)); + if (IFNET_IS_CELLULAR(ifp)) { + st->ifnet_type = NSTAT_IFNET_ECN_TYPE_CELLULAR; + } else if (IFNET_IS_WIFI(ifp)) { + st->ifnet_type = NSTAT_IFNET_ECN_TYPE_WIFI; + } else { + st->ifnet_type = NSTAT_IFNET_ECN_TYPE_ETHERNET; + } + + /* skip if there was no update since last report */ + if (ifp->if_ipv4_stat->timestamp <= 0 || + ifp->if_ipv4_stat->timestamp < last_report_time) + goto v6; + st->ifnet_proto = NSTAT_IFNET_ECN_PROTO_IPV4; + bcopy(ifp->if_ipv4_stat, &st->ecn_stat, + sizeof(st->ecn_stat)); + nstat_sysinfo_send_data(&data); + bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat)); + +v6: + /* skip if there was no update since last report */ + if (ifp->if_ipv6_stat->timestamp <= 0 || + ifp->if_ipv6_stat->timestamp < last_report_time) + continue; + st->ifnet_proto = NSTAT_IFNET_ECN_PROTO_IPV6; + bcopy(ifp->if_ipv6_stat, &st->ecn_stat, + sizeof(st->ecn_stat)); + nstat_sysinfo_send_data(&data); + + /* Zero the stats in ifp */ + bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat)); + } + ifnet_head_done(); + +} + static errno_t nstat_ifnet_copy_descriptor( nstat_provider_cookie_t cookie, @@ -2210,6 +2281,14 @@ nstat_sysinfo_send_data_internal( nkeyvals = sizeof(struct nstat_sysinfo_tcp_stats) / sizeof(u_int32_t); break; + case NSTAT_SYSINFO_IFNET_ECN_STATS: + nkeyvals = (sizeof(struct if_tcp_ecn_stat) / + sizeof(u_int64_t)); + /* One less because we are not going to send timestamp */ + nkeyvals -= 1; + /* Two more keys for ifnet type and proto */ + nkeyvals += 2; + break; default: return; } @@ -2334,6 +2413,15 @@ nstat_sysinfo_send_data_internal( nstat_set_keyval_scalar(&kv[i++], NSTAT_SYSINFO_ECN_CONN_NOPL_CE, data->u.tcp_stats.ecn_conn_nopl_ce); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_FALLBACK_SYNLOSS, + data->u.tcp_stats.ecn_fallback_synloss); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_FALLBACK_REORDER, + 
data->u.tcp_stats.ecn_fallback_reorder); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_FALLBACK_CE, + data->u.tcp_stats.ecn_fallback_ce); nstat_set_keyval_scalar(&kv[i++], NSTAT_SYSINFO_TFO_SYN_DATA_RCV, data->u.tcp_stats.tfo_syn_data_rcv); @@ -2368,6 +2456,110 @@ nstat_sysinfo_send_data_internal( VERIFY(i == nkeyvals); break; } + case NSTAT_SYSINFO_IFNET_ECN_STATS: + { + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_TYPE, + data->u.ifnet_ecn_stats.ifnet_type); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_PROTO, + data->u.ifnet_ecn_stats.ifnet_proto); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_CLIENT_SETUP, + data->u.ifnet_ecn_stats.ecn_stat.ecn_client_setup); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_SERVER_SETUP, + data->u.ifnet_ecn_stats.ecn_stat.ecn_server_setup); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_CLIENT_SUCCESS, + data->u.ifnet_ecn_stats.ecn_stat.ecn_client_success); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_SERVER_SUCCESS, + data->u.ifnet_ecn_stats.ecn_stat.ecn_server_success); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_PEER_NOSUPPORT, + data->u.ifnet_ecn_stats.ecn_stat.ecn_peer_nosupport); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_SYN_LOST, + data->u.ifnet_ecn_stats.ecn_stat.ecn_syn_lost); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_SYNACK_LOST, + data->u.ifnet_ecn_stats.ecn_stat.ecn_synack_lost); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_RECV_CE, + data->u.ifnet_ecn_stats.ecn_stat.ecn_recv_ce); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_RECV_ECE, + data->u.ifnet_ecn_stats.ecn_stat.ecn_recv_ece); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_CONN_RECV_CE, + data->u.ifnet_ecn_stats.ecn_stat.ecn_conn_recv_ce); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_CONN_RECV_ECE, + 
data->u.ifnet_ecn_stats.ecn_stat.ecn_conn_recv_ece); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_CONN_PLNOCE, + data->u.ifnet_ecn_stats.ecn_stat.ecn_conn_plnoce); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_CONN_PLCE, + data->u.ifnet_ecn_stats.ecn_stat.ecn_conn_plce); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_CONN_NOPLCE, + data->u.ifnet_ecn_stats.ecn_stat.ecn_conn_noplce); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_FALLBACK_SYNLOSS, + data->u.ifnet_ecn_stats.ecn_stat.ecn_fallback_synloss); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_FALLBACK_REORDER, + data->u.ifnet_ecn_stats.ecn_stat.ecn_fallback_reorder); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_FALLBACK_CE, + data->u.ifnet_ecn_stats.ecn_stat.ecn_fallback_ce); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_ON_RTT_AVG, + data->u.ifnet_ecn_stats.ecn_stat.ecn_on.rtt_avg); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_ON_RTT_VAR, + data->u.ifnet_ecn_stats.ecn_stat.ecn_on.rtt_var); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_ON_OOPERCENT, + data->u.ifnet_ecn_stats.ecn_stat.ecn_on.oo_percent); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_ON_SACK_EPISODE, + data->u.ifnet_ecn_stats.ecn_stat.ecn_on.sack_episodes); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_ON_REORDER_PERCENT, + data->u.ifnet_ecn_stats.ecn_stat.ecn_on.reorder_percent); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_ON_RXMIT_PERCENT, + data->u.ifnet_ecn_stats.ecn_stat.ecn_on.rxmit_percent); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_ON_RXMIT_DROP, + data->u.ifnet_ecn_stats.ecn_stat.ecn_on.rxmit_drop); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_OFF_RTT_AVG, + data->u.ifnet_ecn_stats.ecn_stat.ecn_off.rtt_avg); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_OFF_RTT_VAR, + 
data->u.ifnet_ecn_stats.ecn_stat.ecn_off.rtt_var); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_OFF_OOPERCENT, + data->u.ifnet_ecn_stats.ecn_stat.ecn_off.oo_percent); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_OFF_SACK_EPISODE, + data->u.ifnet_ecn_stats.ecn_stat.ecn_off.sack_episodes); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_OFF_REORDER_PERCENT, + data->u.ifnet_ecn_stats.ecn_stat.ecn_off.reorder_percent); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_OFF_RXMIT_PERCENT, + data->u.ifnet_ecn_stats.ecn_stat.ecn_off.rxmit_percent); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_ECN_IFNET_OFF_RXMIT_DROP, + data->u.ifnet_ecn_stats.ecn_stat.ecn_off.rxmit_drop); + VERIFY(i == nkeyvals); + break; + } } if (syscnt != NULL) @@ -2407,6 +2599,7 @@ nstat_sysinfo_generate_report(void) { mbuf_report_peak_usage(); tcp_report_stats(); + nstat_ifnet_report_ecn_stats(); } #pragma mark -- Kernel Control Socket -- diff --git a/bsd/net/ntstat.h b/bsd/net/ntstat.h index a6bcec06b..1d479d3ab 100644 --- a/bsd/net/ntstat.h +++ b/bsd/net/ntstat.h @@ -136,6 +136,43 @@ enum ,NSTAT_SYSINFO_TFO_SYN_DATA_ACKED = 41 ,NSTAT_SYSINFO_TFO_SYN_LOSS = 42 ,NSTAT_SYSINFO_TFO_BLACKHOLE = 43 + ,NSTAT_SYSINFO_ECN_FALLBACK_SYNLOSS = 44 + ,NSTAT_SYSINFO_ECN_FALLBACK_REORDER = 45 + ,NSTAT_SYSINFO_ECN_FALLBACK_CE = 46 + ,NSTAT_SYSINFO_ECN_IFNET_TYPE = 47 + ,NSTAT_SYSINFO_ECN_IFNET_PROTO = 48 + ,NSTAT_SYSINFO_ECN_IFNET_CLIENT_SETUP = 49 + ,NSTAT_SYSINFO_ECN_IFNET_SERVER_SETUP = 50 + ,NSTAT_SYSINFO_ECN_IFNET_CLIENT_SUCCESS = 51 + ,NSTAT_SYSINFO_ECN_IFNET_SERVER_SUCCESS = 52 + ,NSTAT_SYSINFO_ECN_IFNET_PEER_NOSUPPORT = 53 + ,NSTAT_SYSINFO_ECN_IFNET_SYN_LOST = 54 + ,NSTAT_SYSINFO_ECN_IFNET_SYNACK_LOST = 55 + ,NSTAT_SYSINFO_ECN_IFNET_RECV_CE = 56 + ,NSTAT_SYSINFO_ECN_IFNET_RECV_ECE = 57 + ,NSTAT_SYSINFO_ECN_IFNET_SENT_ECE = 58 + ,NSTAT_SYSINFO_ECN_IFNET_CONN_RECV_CE = 59 + ,NSTAT_SYSINFO_ECN_IFNET_CONN_RECV_ECE = 60 + 
,NSTAT_SYSINFO_ECN_IFNET_CONN_PLNOCE = 61 + ,NSTAT_SYSINFO_ECN_IFNET_CONN_PLCE = 62 + ,NSTAT_SYSINFO_ECN_IFNET_CONN_NOPLCE = 63 + ,NSTAT_SYSINFO_ECN_IFNET_FALLBACK_SYNLOSS = 64 + ,NSTAT_SYSINFO_ECN_IFNET_FALLBACK_REORDER = 65 + ,NSTAT_SYSINFO_ECN_IFNET_FALLBACK_CE = 66 + ,NSTAT_SYSINFO_ECN_IFNET_ON_RTT_AVG = 67 + ,NSTAT_SYSINFO_ECN_IFNET_ON_RTT_VAR = 68 + ,NSTAT_SYSINFO_ECN_IFNET_ON_OOPERCENT = 69 + ,NSTAT_SYSINFO_ECN_IFNET_ON_SACK_EPISODE = 70 + ,NSTAT_SYSINFO_ECN_IFNET_ON_REORDER_PERCENT = 71 + ,NSTAT_SYSINFO_ECN_IFNET_ON_RXMIT_PERCENT = 72 + ,NSTAT_SYSINFO_ECN_IFNET_ON_RXMIT_DROP = 73 + ,NSTAT_SYSINFO_ECN_IFNET_OFF_RTT_AVG = 74 + ,NSTAT_SYSINFO_ECN_IFNET_OFF_RTT_VAR = 75 + ,NSTAT_SYSINFO_ECN_IFNET_OFF_OOPERCENT = 76 + ,NSTAT_SYSINFO_ECN_IFNET_OFF_SACK_EPISODE = 77 + ,NSTAT_SYSINFO_ECN_IFNET_OFF_REORDER_PERCENT = 78 + ,NSTAT_SYSINFO_ECN_IFNET_OFF_RXMIT_PERCENT = 79 + ,NSTAT_SYSINFO_ECN_IFNET_OFF_RXMIT_DROP = 80 }; #pragma mark -- Network Statistics Providers -- @@ -463,6 +500,7 @@ typedef struct nstat_sysinfo_add_param #define NSTAT_SYSINFO_MBUF_STATS 0x0001 #define NSTAT_SYSINFO_TCP_STATS 0x0002 +#define NSTAT_SYSINFO_IFNET_ECN_STATS 0x0003 #pragma mark -- Network Statistics User Client -- @@ -703,6 +741,9 @@ typedef struct nstat_sysinfo_tcp_stats u_int32_t ecn_conn_plnoce; /* Number of connections using ECN seen packet loss but never received CE */ u_int32_t ecn_conn_pl_ce; /* Number of connections using ECN seen packet loss and CE */ u_int32_t ecn_conn_nopl_ce; /* Number of connections using ECN with no packet loss but received CE */ + u_int32_t ecn_fallback_synloss; /* Number of times we did fall back due to SYN-Loss */ + u_int32_t ecn_fallback_reorder; /* Number of times we fallback because we detected the PAWS-issue */ + u_int32_t ecn_fallback_ce; /* Number of times we fallback because we received too many CEs */ u_int32_t tfo_syn_data_rcv; /* Number of SYN+data received with valid cookie */ u_int32_t tfo_cookie_req_rcv;/* Number of TFO cookie-requests 
received */ u_int32_t tfo_cookie_sent; /* Number of TFO-cookies offered to the client */ @@ -715,12 +756,30 @@ typedef struct nstat_sysinfo_tcp_stats u_int32_t tfo_blackhole; /* Number of times SYN+TFO has been lost and we fallback */ } nstat_sysinfo_tcp_stats; +enum { + NSTAT_IFNET_ECN_PROTO_IPV4 = 1 + ,NSTAT_IFNET_ECN_PROTO_IPV6 +}; + +enum { + NSTAT_IFNET_ECN_TYPE_CELLULAR = 1 + ,NSTAT_IFNET_ECN_TYPE_WIFI + ,NSTAT_IFNET_ECN_TYPE_ETHERNET +}; + +typedef struct nstat_sysinfo_ifnet_ecn_stats { + u_int32_t ifnet_proto; + u_int32_t ifnet_type; + struct if_tcp_ecn_stat ecn_stat; +} nstat_sysinfo_ifnet_ecn_stats; + typedef struct nstat_sysinfo_data { u_int32_t flags; union { nstat_sysinfo_mbuf_stats mb_stats; nstat_sysinfo_tcp_stats tcp_stats; + nstat_sysinfo_ifnet_ecn_stats ifnet_ecn_stats; } u; } nstat_sysinfo_data; diff --git a/bsd/net/pf_if.c b/bsd/net/pf_if.c index 9dec8f760..75b52b626 100644 --- a/bsd/net/pf_if.c +++ b/bsd/net/pf_if.c @@ -604,24 +604,23 @@ pfi_update_status(const char *name, struct pf_status *pfs) if (p == NULL) return; - if (pfs) { + if (pfs != NULL) { bzero(pfs->pcounters, sizeof (pfs->pcounters)); bzero(pfs->bcounters, sizeof (pfs->bcounters)); - } - /* just clear statistics */ - if (pfs == NULL) { + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) { + pfs->pcounters[i][j][k] += + p->pfik_packets[i][j][k]; + pfs->bcounters[i][j] += + p->pfik_bytes[i][j][k]; + } + } else { + /* just clear statistics */ bzero(p->pfik_packets, sizeof (p->pfik_packets)); bzero(p->pfik_bytes, sizeof (p->pfik_bytes)); p->pfik_tzero = pf_calendar_time_second(); } - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - for (k = 0; k < 2; k++) { - pfs->pcounters[i][j][k] += - p->pfik_packets[i][j][k]; - pfs->bcounters[i][j] += - p->pfik_bytes[i][j][k]; - } } int diff --git a/bsd/netinet/in_gif.c b/bsd/netinet/in_gif.c index df1130702..0cd7a2287 100644 --- a/bsd/netinet/in_gif.c +++ b/bsd/netinet/in_gif.c @@ -239,8 +239,9 @@ in_gif_input(m, off) 
struct ifnet *gifp = NULL; struct ip *ip; int af, proto; - u_int8_t otos; + u_int8_t otos, old_tos; int egress_success = 0; + int sum; ip = mtod(m, struct ip *); proto = ip->ip_p; @@ -268,9 +269,17 @@ in_gif_input(m, off) return; } ip = mtod(m, struct ip *); - if (gifp->if_flags & IFF_LINK1) + if (gifp->if_flags & IFF_LINK1) { + old_tos = ip->ip_tos; egress_success = ip_ecn_egress(ECN_NORMAL, &otos, &ip->ip_tos); - else + if (old_tos != ip->ip_tos) { + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += (~old_tos & 0xffff) + ip->ip_tos; /* RFC 1624: remove old inner TOS, add new one */ + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + ip->ip_sum = htons(~sum & 0xffff); + } + } else egress_success = ip_ecn_egress(ECN_NOCARE, &otos, &ip->ip_tos); break; } diff --git a/bsd/netinet/in_pcb.c b/bsd/netinet/in_pcb.c index 0cbd238cc..dce4177d4 100644 --- a/bsd/netinet/in_pcb.c +++ b/bsd/netinet/in_pcb.c @@ -714,11 +714,12 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) return (EINVAL); if (!(so->so_options & (SO_REUSEADDR|SO_REUSEPORT))) wild = 1; - socket_unlock(so, 0); /* keep reference on socket */ - lck_rw_lock_exclusive(pcbinfo->ipi_lock); bzero(&laddr, sizeof(laddr)); + socket_unlock(so, 0); /* keep reference on socket */ + lck_rw_lock_exclusive(pcbinfo->ipi_lock); + if (nam != NULL) { if (nam->sa_len != sizeof (struct sockaddr_in)) { @@ -944,6 +945,17 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) } } socket_lock(so, 0); + + /* + * We unlocked socket's protocol lock for a long time. + * The socket might have been dropped/defuncted. + * Checking if world has changed since. + */ + if (inp->inp_state == INPCB_STATE_DEAD) { + lck_rw_done(pcbinfo->ipi_lock); + return (ECONNABORTED); + } + if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) { lck_rw_done(pcbinfo->ipi_lock); return (EINVAL); @@ -2039,7 +2051,13 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, } /* - * Insert PCB onto various hash lists.
+ * @brief Insert PCB onto various hash lists. + * + * @param inp Pointer to internet protocol control block + * @param locked Implies if ipi_lock (protecting pcb list) + * is already locked or not. + * + * @return int error on failure and 0 on success */ int in_pcbinshash(struct inpcb *inp, int locked) @@ -2059,17 +2077,23 @@ in_pcbinshash(struct inpcb *inp, int locked) socket_unlock(inp->inp_socket, 0); lck_rw_lock_exclusive(pcbinfo->ipi_lock); socket_lock(inp->inp_socket, 0); - if (inp->inp_state == INPCB_STATE_DEAD) { - /* - * The socket got dropped when - * it was unlocked - */ - lck_rw_done(pcbinfo->ipi_lock); - return (ECONNABORTED); - } } } + /* + * This routine or its caller may have given up + * socket's protocol lock briefly. + * During that time the socket may have been dropped. + * Safe-guarding against that. + */ + if (inp->inp_state == INPCB_STATE_DEAD) { + if (!locked) { + lck_rw_done(pcbinfo->ipi_lock); + } + return (ECONNABORTED); + } + + #if INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; @@ -2093,8 +2117,6 @@ in_pcbinshash(struct inpcb *inp, int locked) break; } - VERIFY(inp->inp_state != INPCB_STATE_DEAD); - /* * If none exists, malloc one and tack it on. 
*/ diff --git a/bsd/netinet/in_pcb.h b/bsd/netinet/in_pcb.h index 7ce89307a..861658176 100644 --- a/bsd/netinet/in_pcb.h +++ b/bsd/netinet/in_pcb.h @@ -498,6 +498,15 @@ struct xinpgen { #if IPSEC #define in6p_sp inp_sp #endif /* IPSEC */ +#define INP_INC_IFNET_STAT(_inp_, _stat_) { \ + if ((_inp_)->inp_last_outifp != NULL) { \ + if ((_inp_)->inp_vflag & INP_IPV6) { \ + (_inp_)->inp_last_outifp->if_ipv6_stat->_stat_++;\ + } else { \ + (_inp_)->inp_last_outifp->if_ipv4_stat->_stat_++;\ + }\ + }\ +} struct inpcbport { LIST_ENTRY(inpcbport) phd_hash; diff --git a/bsd/netinet/in_pcblist.c b/bsd/netinet/in_pcblist.c index 3e0facc1c..2a00f895f 100644 --- a/bsd/netinet/in_pcblist.c +++ b/bsd/netinet/in_pcblist.c @@ -464,6 +464,13 @@ inpcb_get_ports_used(uint32_t ifindex, int protocol, uint32_t flags, if (SOCK_PROTO(inp->inp_socket) == IPPROTO_TCP) { struct tcpcb *tp = sototcpcb(inp->inp_socket); + /* + * Workaround race where inp_ppcb is NULL during + * socket initialization + */ + if (tp == NULL) + continue; + switch (tp->t_state) { case TCPS_CLOSED: continue; diff --git a/bsd/netinet/ip_ecn.c b/bsd/netinet/ip_ecn.c index 38c790026..70ce0cfb8 100644 --- a/bsd/netinet/ip_ecn.c +++ b/bsd/netinet/ip_ecn.c @@ -117,13 +117,9 @@ ip_ecn_egress(mode, outer, inner) /* Process ECN for both normal and compatibility modes */ case ECN_NORMAL: case ECN_COMPATIBILITY: - if ((*outer & IPTOS_ECN_MASK) == IPTOS_ECN_CE) { - if ((*inner & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) { - /* Drop */ - return (0); - } else { - *inner |= IPTOS_ECN_CE; - } + if (((*outer & IPTOS_ECN_MASK) == IPTOS_ECN_CE) && + ((*inner & IPTOS_ECN_MASK) != IPTOS_ECN_NOTECT)) { + *inner |= IPTOS_ECN_CE; } else if ((*outer & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 && (*inner & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0) { *inner = *outer; diff --git a/bsd/netinet/tcp.h b/bsd/netinet/tcp.h index 134a61535..9dcb06620 100644 --- a/bsd/netinet/tcp.h +++ b/bsd/netinet/tcp.h @@ -239,10 +239,15 @@ struct tcphdr { * Enable message delivery on a 
socket, this feature is currently unsupported and * is subjected to change in future. */ -#define TCP_ENABLE_MSGS 0x206 +#define TCP_ENABLE_MSGS 0x206 #define TCP_ADAPTIVE_WRITE_TIMEOUT 0x207 /* Write timeout used as a multiple of RTT */ -#define TCP_NOTIMEWAIT 0x208 /* Avoid going into time-wait */ +#define TCP_NOTIMEWAIT 0x208 /* Avoid going into time-wait */ #define TCP_DISABLE_BLACKHOLE_DETECTION 0x209 /* disable PMTU blackhole detection */ +#define TCP_ECN_MODE 0x210 /* fine grain control for A/B testing */ + +#define ECN_MODE_DEFAULT 0x0 /* per interface or system wide default */ +#define ECN_MODE_ENABLE 0x1 /* force enable ECN on connection */ +#define ECN_MODE_DISABLE 0x2 /* force disable ECN on connection */ /* * The TCP_INFO socket option is a private API and is subject to change @@ -339,6 +344,26 @@ struct tcp_info { tcpi_tfo_cookie_req_rcv:1, /* Server received cookie-request */ tcpi_tfo_cookie_sent:1, /* Server announced cookie */ tcpi_tfo_cookie_invalid:1; /* Server received an invalid cookie */ + + u_int16_t tcpi_ecn_client_setup:1, /* Attempted ECN setup from client side */ + tcpi_ecn_server_setup:1, /* Attempted ECN setup from server side */ + tcpi_ecn_success:1, /* peer negotiated ECN */ + tcpi_ecn_lost_syn:1, /* Lost SYN with ECN setup */ + tcpi_ecn_lost_synack:1, /* Lost SYN-ACK with ECN setup */ + tcpi_local_peer:1, /* Local to the host or the subnet */ + tcpi_if_cell:1, /* Interface is cellular */ + tcpi_if_wifi:1; /* Interface is WiFi */ + + u_int32_t tcpi_ecn_recv_ce; /* Packets received with CE */ + u_int32_t tcpi_ecn_recv_cwr; /* Packets received with CWR */ + + u_int32_t tcpi_rcvoopack; /* out-of-order packets received */ + u_int32_t tcpi_pawsdrop; /* segments dropped due to PAWS */ + u_int32_t tcpi_sack_recovery_episode; /* SACK recovery episodes */ + u_int32_t tcpi_reordered_pkts; /* packets reorderd */ + u_int32_t tcpi_dsack_sent; /* Sent DSACK notification */ + u_int32_t tcpi_dsack_recvd; /* Received a valid DSACK option */ + 
u_int32_t tcpi_flowhash; /* Unique id for the connection */ }; struct tcp_measure_bw_burst { diff --git a/bsd/netinet/tcp_cache.c b/bsd/netinet/tcp_cache.c index cb3b86d04..b872c7d32 100644 --- a/bsd/netinet/tcp_cache.c +++ b/bsd/netinet/tcp_cache.c @@ -56,18 +56,21 @@ struct tcp_heuristic { struct tcp_heuristic_key th_key; - /* - * If tfo_cookie_loss is changed to a smaller type, it might be worth - * checking for integer-overflow in tcp_cache_tfo_inc_loss - */ - u_int32_t th_tfo_cookie_loss; /* The number of times a SYN+cookie has been lost */ + char th_val_start[0]; /* Marker for memsetting to 0 */ + + u_int8_t th_tfo_cookie_loss; /* The number of times a SYN+cookie has been lost */ + u_int8_t th_ecn_loss; /* The number of times a SYN+ecn has been lost */ + u_int8_t th_ecn_aggressive; /* The number of times we did an aggressive fallback */ u_int32_t th_tfo_fallback_trials; /* Number of times we did not try out TFO due to SYN-loss */ u_int32_t th_tfo_cookie_backoff; /* Time until when we should not try out TFO */ + u_int32_t th_ecn_backoff; /* Time until when we should not try out ECN */ - u_int8_t th_tfo_in_backoff:1, /* Are we doing TFO due to the backoff timer? */ - th_tfo_aggressive_fallback:1, /* Agressive fallback due to nasty middlebox */ + u_int8_t th_tfo_in_backoff:1, /* Are we avoiding TFO due to the backoff timer? 
*/ + th_tfo_aggressive_fallback:1, /* Aggressive fallback due to nasty middlebox */ th_tfo_snd_middlebox_supp:1, /* We are sure that the network supports TFO in upstream direction */ th_tfo_rcv_middlebox_supp:1; /* We are sure that the network supports TFO in downstream direction*/ + + char th_val_end[0]; /* Marker for memsetting to 0 */ }; struct tcp_heuristics_head { @@ -131,8 +134,9 @@ static lck_attr_t *tcp_heuristic_mtx_attr; static lck_grp_t *tcp_heuristic_mtx_grp; static lck_grp_attr_t *tcp_heuristic_mtx_grp_attr; -/* Number of SYN-losses we accept */ -#define TFO_MAX_COOKIE_LOSS 2 +int tcp_ecn_timeout = 60; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_ecn_timeout, 0, "Initial minutes to wait before re-trying ECN"); /* * Round up to next higher power-of 2. See "Bit Twiddling Hacks". @@ -446,13 +450,8 @@ static struct tcp_heuristic *tcp_getheuristic_with_lock(struct tcpcb *tp, tpheur = oldest_heur; /* We recycle - set everything to 0 */ - tpheur->th_tfo_cookie_loss = 0; - tpheur->th_tfo_fallback_trials = 0; - tpheur->th_tfo_cookie_backoff = 0; - tpheur->th_tfo_in_backoff = 0; - tpheur->th_tfo_aggressive_fallback = 0; - tpheur->th_tfo_snd_middlebox_supp = 0; - tpheur->th_tfo_rcv_middlebox_supp = 0; + bzero(tpheur->th_val_start, + tpheur->th_val_end - tpheur->th_val_start); } else { /* Create a new heuristic and add it to the list */ tpheur = _MALLOC(sizeof(struct tcp_heuristic), M_TEMP, @@ -463,6 +462,13 @@ static struct tcp_heuristic *tcp_getheuristic_with_lock(struct tcpcb *tp, SLIST_INSERT_HEAD(&head->tcp_heuristics, tpheur, list); } + /* + * Set to tcp_now, to make sure it won't be > than tcp_now in the + * near future. 
+ */ + tpheur->th_ecn_backoff = tcp_now; + tpheur->th_tfo_cookie_backoff = tcp_now; + memcpy(&tpheur->th_key, &key, sizeof(key)); } @@ -523,7 +529,7 @@ void tcp_heuristic_tfo_snd_good(struct tcpcb *tp) tp->t_tfo_flags |= TFO_F_NO_SNDPROBING; } -void tcp_heuristic_tfo_inc_loss(struct tcpcb *tp) +void tcp_heuristic_inc_loss(struct tcpcb *tp, int tfo, int ecn) { struct tcp_heuristics_head *head; struct tcp_heuristic *tpheur; @@ -532,8 +538,20 @@ void tcp_heuristic_tfo_inc_loss(struct tcpcb *tp) if (tpheur == NULL) return; - /* Potential integer overflow, but tfo_cookie_loss is 32-bits */ - tpheur->th_tfo_cookie_loss++; + /* Limit to 9 to prevent integer-overflow during exponential backoff */ + if (tfo && tpheur->th_tfo_cookie_loss < 9) + tpheur->th_tfo_cookie_loss++; + + if (ecn && tpheur->th_ecn_loss < 9) { + tpheur->th_ecn_loss++; + if (tpheur->th_ecn_loss >= ECN_MAX_SYN_LOSS) { + tcpstat.tcps_ecn_fallback_synloss++; + INP_INC_IFNET_STAT(tp->t_inpcb, ecn_fallback_synloss); + tpheur->th_ecn_backoff = tcp_now + + ((tcp_ecn_timeout * 60 * TCP_RETRANSHZ) + << (tpheur->th_ecn_loss - ECN_MAX_SYN_LOSS)); + } + } tcp_heuristic_unlock(head); } @@ -552,7 +570,30 @@ void tcp_heuristic_tfo_middlebox(struct tcpcb *tp) tcp_heuristic_unlock(head); } -void tcp_heuristic_tfo_reset_loss(struct tcpcb *tp) +void tcp_heuristic_ecn_aggressive(struct tcpcb *tp) +{ + struct tcp_heuristics_head *head; + struct tcp_heuristic *tpheur; + + tpheur = tcp_getheuristic_with_lock(tp, 1, &head); + if (tpheur == NULL) + return; + + /* Must be done before, otherwise we will start off with expo-backoff */ + tpheur->th_ecn_backoff = tcp_now + + ((tcp_ecn_timeout * 60 * TCP_RETRANSHZ) << (tpheur->th_ecn_aggressive)); + + /* + * Ugly way to prevent integer overflow... limit to 9 to prevent in + * overflow during exp. backoff. 
+ */ + if (tpheur->th_ecn_aggressive < 9) + tpheur->th_ecn_aggressive++; + + tcp_heuristic_unlock(head); +} + +void tcp_heuristic_reset_loss(struct tcpcb *tp, int tfo, int ecn) { struct tcp_heuristics_head *head; struct tcp_heuristic *tpheur; @@ -566,8 +607,11 @@ void tcp_heuristic_tfo_reset_loss(struct tcpcb *tp) if (tpheur == NULL) return; - tpheur->th_tfo_cookie_loss = 0; - tpheur->th_tfo_aggressive_fallback = 0; + if (tfo) + tpheur->th_tfo_cookie_loss = 0; + + if (ecn) + tpheur->th_ecn_loss = 0; tcp_heuristic_unlock(head); } @@ -634,6 +678,25 @@ boolean_t tcp_heuristic_do_tfo(struct tcpcb *tp) return (true); } +boolean_t tcp_heuristic_do_ecn(struct tcpcb *tp) +{ + struct tcp_heuristics_head *head; + struct tcp_heuristic *tpheur; + boolean_t ret = true; + + /* Get the tcp-heuristic. */ + tpheur = tcp_getheuristic_with_lock(tp, 0, &head); + if (tpheur == NULL) + return ret; + + if (TSTMP_GT(tpheur->th_ecn_backoff, tcp_now)) + ret = false; + + tcp_heuristic_unlock(head); + + return (ret); +} + static void sysctl_cleartfocache(void) { int i; diff --git a/bsd/netinet/tcp_cache.h b/bsd/netinet/tcp_cache.h index 601aec807..4408fd5ff 100644 --- a/bsd/netinet/tcp_cache.h +++ b/bsd/netinet/tcp_cache.h @@ -34,17 +34,26 @@ #include #include +/* Number of SYN-losses we accept */ +#define TFO_MAX_COOKIE_LOSS 2 +#define ECN_MAX_SYN_LOSS 2 + +#define ECN_MIN_CE_PROBES 10 /* Probes are basically the number of incoming packets */ +#define ECN_MAX_CE_RATIO 7 /* Ratio is the maximum number of CE-packets we accept per incoming "probe" */ + extern void tcp_cache_set_cookie(struct tcpcb *tp, u_char *cookie, u_int8_t len); extern int tcp_cache_get_cookie(struct tcpcb *tp, u_char *cookie, u_int8_t *len); extern unsigned int tcp_cache_get_cookie_len(struct tcpcb *tp); -extern void tcp_heuristic_tfo_inc_loss(struct tcpcb *tp); +extern void tcp_heuristic_inc_loss(struct tcpcb *tp, int tfo, int ecn); extern void tcp_heuristic_tfo_snd_good(struct tcpcb *tp); extern void 
tcp_heuristic_tfo_rcv_good(struct tcpcb *tp); extern void tcp_heuristic_tfo_middlebox(struct tcpcb *tp); -extern void tcp_heuristic_tfo_reset_loss(struct tcpcb *tp); +extern void tcp_heuristic_ecn_aggressive(struct tcpcb *tp); +extern void tcp_heuristic_reset_loss(struct tcpcb *tp, int tfo, int ecn); extern void tcp_heuristic_tfo_success(struct tcpcb *tp); extern boolean_t tcp_heuristic_do_tfo(struct tcpcb *tp); +extern boolean_t tcp_heuristic_do_ecn(struct tcpcb *tp); extern void tcp_cache_init(void); diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c index 8f2a92cc8..1d65c4355 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -707,6 +707,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, th->th_seq += i; } } + tp->t_rcvoopack++; tcpstat.tcps_rcvoopack++; tcpstat.tcps_rcvoobyte += *tlenp; if (nstat_collect) { @@ -1649,7 +1650,7 @@ tcp_tfo_synack(tp, to) /* * If this happens, things have gone terribly wrong. len should - * have been check in tcp_dooptions. + * have been checked in tcp_dooptions. */ VERIFY(len <= TFO_COOKIE_LEN_MAX); @@ -1668,9 +1669,9 @@ tcp_tfo_synack(tp, to) * backing of TFO-cookie requests. */ if (tp->t_tfo_flags & TFO_F_SYN_LOSS) - tcp_heuristic_tfo_inc_loss(tp); + tcp_heuristic_inc_loss(tp, 1, 0); else - tcp_heuristic_tfo_reset_loss(tp); + tcp_heuristic_reset_loss(tp, 1, 0); } } @@ -2546,18 +2547,21 @@ tcp_input(m, off0) TCP_ECN_ENABLED(tp) && tlen > 0 && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { + tp->t_ecn_recv_ce++; tcpstat.tcps_ecn_recv_ce++; + INP_INC_IFNET_STAT(inp, ecn_recv_ce); /* Mark this connection as it received CE from network */ tp->ecn_flags |= TE_RECV_ECN_CE; tp->ecn_flags |= TE_SENDECE; } - + /* * Clear TE_SENDECE if TH_CWR is set. This is harmless, so we don't * bother doing extensive checks for state and whatnot. 
*/ if (thflags & TH_CWR) { tp->ecn_flags &= ~TE_SENDECE; + tp->t_ecn_recv_cwr++; } /* @@ -2573,6 +2577,30 @@ tcp_input(m, off0) CLEAR_IAJ_STATE(tp); } + if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED && + !TCP_ECN_ENABLED(tp) && !(tp->ecn_flags & TE_CEHEURI_SET)) { + tcpstat.tcps_ecn_fallback_ce++; + tcp_heuristic_ecn_aggressive(tp); + tp->ecn_flags |= TE_CEHEURI_SET; + } + + if (tp->t_state == TCPS_ESTABLISHED && TCP_ECN_ENABLED(tp) && + ip_ecn == IPTOS_ECN_CE && !(tp->ecn_flags & TE_CEHEURI_SET)) { + if (inp->inp_stat->rxpackets < ECN_MIN_CE_PROBES) { + tp->t_ecn_recv_ce_pkt++; + } else if (tp->t_ecn_recv_ce_pkt > ECN_MAX_CE_RATIO) { + tcpstat.tcps_ecn_fallback_ce++; + tcp_heuristic_ecn_aggressive(tp); + tp->ecn_flags |= TE_CEHEURI_SET; + INP_INC_IFNET_STAT(inp,ecn_fallback_ce); + } else { + /* We tracked the first ECN_MIN_CE_PROBES segments, we + * now know that the path is good. + */ + tp->ecn_flags |= TE_CEHEURI_SET; + } + } + /* * Try to determine if we are receiving a packet after a long time. * Use our own approximation of idletime to roughly measure remote @@ -2711,7 +2739,7 @@ tcp_input(m, off0) * be TH_NEEDSYN. 
*/ if (tp->t_state == TCPS_ESTABLISHED && - (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE)) == TH_ACK && + (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_CWR)) == TH_ACK && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && @@ -2731,11 +2759,6 @@ tcp_input(m, off0) tp->ts_recent = to.to_tsval; } - /* Force acknowledgment if we received a FIN */ - - if (thflags & TH_FIN) - tp->t_flags |= TF_ACKNOW; - if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && @@ -3189,12 +3212,20 @@ tcp_input(m, off0) if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) { /* ECN-setup SYN-ACK */ tp->ecn_flags |= TE_SETUPRECEIVED; - if (TCP_ECN_ENABLED(tp)) + if (TCP_ECN_ENABLED(tp)) { + tcp_heuristic_reset_loss(tp, 0, 1); tcpstat.tcps_ecn_client_success++; + } } else { if (tp->ecn_flags & TE_SETUPSENT && - tp->t_rxtshift == 0) + tp->t_rxtshift == 0) { + tcp_heuristic_reset_loss(tp, 0, 1); tcpstat.tcps_ecn_not_supported++; + } + if (tp->ecn_flags & TE_SETUPSENT && + tp->t_rxtshift > 0) + tcp_heuristic_inc_loss(tp, 0, 1); + /* non-ECN-setup SYN-ACK */ tp->ecn_flags &= ~TE_SENDIPECT; } @@ -3506,7 +3537,24 @@ tcp_input(m, off0) } else { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += tlen; + tp->t_pawsdrop++; tcpstat.tcps_pawsdrop++; + + /* + * PAWS-drop when ECN is being used? That indicates + * that ECT-marked packets take a different path, with + * different congestion-characteristics. + * + * Only fallback when we did send less than 2GB as PAWS + * really has no reason to kick in earlier. 
+ */ + if (TCP_ECN_ENABLED(tp) && + inp->inp_stat->rxbytes < 2147483648) { + INP_INC_IFNET_STAT(inp, ecn_fallback_reorder); + tcpstat.tcps_ecn_fallback_reorder++; + tcp_heuristic_ecn_aggressive(tp); + } + if (nstat_collect) { nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, tlen, NSTAT_RX_FLAG_DUPLICATE); @@ -4139,6 +4187,7 @@ tcp_input(m, off0) if (SACK_ENABLED(tp)) { tcpstat.tcps_sack_recovery_episode++; + tp->t_sack_recovery_episode++; tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; tp->t_flagsext &= @@ -4331,6 +4380,7 @@ tcp_input(m, off0) * ECE atleast once */ tp->ecn_flags |= TE_RECV_ECN_ECE; + INP_INC_IFNET_STAT(inp, ecn_recv_ece); tcpstat.tcps_ecn_recv_ece++; tcp_ccdbg_trace(tp, th, TCP_CC_ECN_RCVD); } @@ -4736,8 +4786,7 @@ tcp_input(m, off0) tp->t_flags |= TF_DELACK; tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); } - } - else { + } else { tp->t_flags |= TF_ACKNOW; } tp->rcv_nxt++; diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c index e348fadde..86d4a71f3 100644 --- a/bsd/netinet/tcp_output.c +++ b/bsd/netinet/tcp_output.c @@ -154,15 +154,75 @@ int tcp_do_tso = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_tso, 0, "Enable TCP Segmentation Offload"); +static int +sysctl_change_ecn_setting SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int i, err = 0, changed = 0; + struct ifnet *ifp; + + err = sysctl_io_number(req, tcp_ecn_outbound, sizeof(int32_t), + &i, &changed); + if (err != 0 || req->newptr == USER_ADDR_NULL) + return(err); + + if (changed) { + if ((tcp_ecn_outbound == 0 || tcp_ecn_outbound == 1) && + (i == 0 || i == 1)) { + tcp_ecn_outbound = i; + return(err); + } + if (tcp_ecn_outbound == 2 && (i == 0 || i == 1)) { + /* + * Reset ECN enable flags on non-cellular + * interfaces so that the system default will take + * over + */ + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (!IFNET_IS_CELLULAR(ifp)) { + ifnet_lock_exclusive(ifp); + 
ifp->if_eflags &= ~IFEF_ECN_DISABLE; + ifp->if_eflags &= ~IFEF_ECN_ENABLE; + ifnet_lock_done(ifp); + } + } + ifnet_head_done(); + } else { + /* + * Set ECN enable flags on non-cellular + * interfaces + */ + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (!IFNET_IS_CELLULAR(ifp)) { + ifnet_lock_exclusive(ifp); + ifp->if_eflags |= IFEF_ECN_ENABLE; + ifp->if_eflags &= ~IFEF_ECN_DISABLE; + ifnet_lock_done(ifp); + } + } + ifnet_head_done(); + } + tcp_ecn_outbound = i; + } + /* Change the other one too as the work is done */ + if (i == 2 || tcp_ecn_inbound == 2) + tcp_ecn_inbound = i; + return (err); +} + int tcp_ecn_outbound = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_initiate_out, - CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_outbound, 0, - "Initiate ECN for outbound connections"); +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn_initiate_out, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_outbound, 0, + sysctl_change_ecn_setting, "IU", + "Initiate ECN for outbound connections"); int tcp_ecn_inbound = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_negotiate_in, - CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_inbound, 0, - "Allow ECN negotiation for inbound connections"); +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn_negotiate_in, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_inbound, 0, + sysctl_change_ecn_setting, "IU", + "Initiate ECN for inbound connections"); int tcp_packet_chaining = 50; SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain, @@ -379,6 +439,56 @@ tcp_send_ecn_flags_on_syn(struct tcpcb *tp, struct socket *so) (tp->t_flagsext & TF_FASTOPEN))); } +void +tcp_set_ecn(struct tcpcb *tp, struct ifnet *ifp) +{ + boolean_t inbound; + + /* + * Socket option has precedence + */ + if (tp->ecn_flags & TE_ECN_MODE_ENABLE) { + tp->ecn_flags |= TE_ENABLE_ECN; + goto check_heuristic; + } + + if (tp->ecn_flags & TE_ECN_MODE_DISABLE) { + tp->ecn_flags &= ~TE_ENABLE_ECN; + return; + } + /* + * Per interface setting comes next + */ + if (ifp != NULL) { + if 
(ifp->if_eflags & IFEF_ECN_ENABLE) { + tp->ecn_flags |= TE_ENABLE_ECN; + goto check_heuristic; + } + + if (ifp->if_eflags & IFEF_ECN_DISABLE) { + tp->ecn_flags &= ~TE_ENABLE_ECN; + return; + } + } + /* + * System wide settings come last + */ + inbound = (tp->t_inpcb->inp_socket->so_head != NULL); + if ((inbound && tcp_ecn_inbound == 1) || + (!inbound && tcp_ecn_outbound == 1)) { + tp->ecn_flags |= TE_ENABLE_ECN; + goto check_heuristic; + } else { + tp->ecn_flags &= ~TE_ENABLE_ECN; + } + + return; + +check_heuristic: + if (!tcp_heuristic_do_ecn(tp)) + tp->ecn_flags &= ~TE_ENABLE_ECN; +} + /* * Tcp output routine: figure out what should be sent and send it. * @@ -609,8 +719,8 @@ tcp_output(struct tcpcb *tp) if ((ifp = rt->rt_ifp) != NULL) { somultipages(so, (ifp->if_hwassist & IFNET_MULTIPAGES)); tcp_set_tso(tp, ifp); - soif2kcl(so, - (ifp->if_eflags & IFEF_2KCL)); + soif2kcl(so, (ifp->if_eflags & IFEF_2KCL)); + tcp_set_ecn(tp, ifp); } if (rt->rt_flags & RTF_UP) RT_GENID_SYNC(rt); @@ -1493,6 +1603,7 @@ tcp_output(struct tcpcb *tp) *lp++ = htonl(tp->t_dsack_lseq); *lp++ = htonl(tp->t_dsack_rseq); tcpstat.tcps_dsack_sent++; + tp->t_dsack_sent++; nsack--; } VERIFY(nsack == 0 || tp->rcv_numsacks >= nsack); @@ -1533,8 +1644,8 @@ tcp_output(struct tcpcb *tp) * * For a SYN-ACK, send an ECN setup SYN-ACK */ - if ((tcp_ecn_inbound || (tp->t_flags & TF_ENABLE_ECN)) - && (flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { + if ((flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK) && + (tp->ecn_flags & TE_ENABLE_ECN)) { if (tp->ecn_flags & TE_SETUPRECEIVED) { if (tcp_send_ecn_flags_on_syn(tp, so)) { /* @@ -1568,6 +1679,7 @@ tcp_output(struct tcpcb *tp) if (tp->ecn_flags & TE_SETUPSENT) { tcpstat.tcps_ecn_lost_synack++; tcpstat.tcps_ecn_server_success--; + tp->ecn_flags |= TE_LOST_SYNACK; } tp->ecn_flags &= @@ -1575,8 +1687,8 @@ tcp_output(struct tcpcb *tp) TE_SENDCWR); } } - } else if ((tcp_ecn_outbound || (tp->t_flags & TF_ENABLE_ECN)) - && (flags & (TH_SYN | TH_ACK)) == TH_SYN) { 
+ } else if ((flags & (TH_SYN | TH_ACK)) == TH_SYN && + (tp->ecn_flags & TE_ENABLE_ECN)) { if (tcp_send_ecn_flags_on_syn(tp, so)) { /* * Setting TH_ECE and TH_CWR makes this an @@ -1584,6 +1696,7 @@ tcp_output(struct tcpcb *tp) */ flags |= (TH_ECE | TH_CWR); tcpstat.tcps_ecn_client_setup++; + tp->ecn_flags |= TE_CLIENT_SETUP; /* * Record that we sent the ECN-setup and default to @@ -1596,8 +1709,10 @@ tcp_output(struct tcpcb *tp) * Fall back to non-ECN and clear flag indicating * we should send data with IP ECT set. */ - if (tp->ecn_flags & TE_SETUPSENT) + if (tp->ecn_flags & TE_SETUPSENT) { tcpstat.tcps_ecn_lost_syn++; + tp->ecn_flags |= TE_LOST_SYN; + } tp->ecn_flags &= ~TE_SENDIPECT; } } @@ -1971,7 +2086,9 @@ tcp_output(struct tcpcb *tp) tcp_rxtseg_insert(tp, tp->snd_nxt, (tp->snd_nxt + len - 1)); } - m->m_pkthdr.pkt_flags |= PKTF_TCP_REXMT; + if (len > 0) + m->m_pkthdr.pkt_flags |= + PKTF_TCP_REXMT; } } else { th->th_seq = htonl(tp->snd_max); @@ -1981,7 +2098,8 @@ tcp_output(struct tcpcb *tp) tcp_rxtseg_insert(tp, p->rxmit, (p->rxmit + len - 1)); p->rxmit += len; tp->sackhint.sack_bytes_rexmit += len; - m->m_pkthdr.pkt_flags |= PKTF_TCP_REXMT; + if (len > 0) + m->m_pkthdr.pkt_flags |= PKTF_TCP_REXMT; } th->th_ack = htonl(tp->rcv_nxt); tp->last_ack_sent = tp->rcv_nxt; diff --git a/bsd/netinet/tcp_sack.c b/bsd/netinet/tcp_sack.c index 7d8b715ed..5d0bf9130 100644 --- a/bsd/netinet/tcp_sack.c +++ b/bsd/netinet/tcp_sack.c @@ -377,6 +377,7 @@ tcp_sack_detect_reordering(struct tcpcb *tp, struct sackhole *s, } tcpstat.tcps_reordered_pkts++; + tp->t_reordered_pkts++; VERIFY(SEQ_GEQ(snd_fack, s->rxmit)); @@ -890,6 +891,7 @@ tcp_sack_process_dsack(struct tcpcb *tp, struct tcpopt *to, to->to_nsacks--; to->to_sacks += TCPOLEN_SACK; tcpstat.tcps_dsack_recvd++; + tp->t_dsack_recvd++; /* ignore DSACK option, if DSACK is disabled */ if (tp->t_flagsext & TF_DISABLE_DSACK) diff --git a/bsd/netinet/tcp_subr.c b/bsd/netinet/tcp_subr.c index 6fafa0f5f..65a171fed 100644 --- 
a/bsd/netinet/tcp_subr.c +++ b/bsd/netinet/tcp_subr.c @@ -1134,6 +1134,89 @@ tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt) } } +static inline void +tcp_update_ecn_perf_stats(struct tcpcb *tp, + struct if_tcp_ecn_perf_stat *stat) +{ + u_int64_t curval, oldval; + struct inpcb *inp = tp->t_inpcb; + + /* Average RTT */ + curval = (tp->t_srtt >> TCP_RTT_SHIFT); + if (curval > 0 && tp->t_rttupdated >= 16) { + if (stat->rtt_avg == 0) { + stat->rtt_avg = curval; + } else { + oldval = stat->rtt_avg; + stat->rtt_avg = + ((oldval << 4) - oldval + curval) >> 4; + } + } + + /* RTT variance */ + curval = tp->t_rttvar >> TCP_RTTVAR_SHIFT; + if (curval > 0 && tp->t_rttupdated >= 16) { + if (stat->rtt_var == 0) { + stat->rtt_var = curval; + } else { + oldval = stat->rtt_var; + stat->rtt_var = + ((oldval << 4) - oldval + curval) >> 4; + } + } + + /* Percentage of Out-of-order packets, shift by 10 for precision */ + curval = (tp->t_rcvoopack << 10); + if (inp->inp_stat != NULL && inp->inp_stat->rxpackets > 0 && + curval > 0) { + /* Compute percentage */ + curval = (curval * 100)/inp->inp_stat->rxpackets; + if (stat->oo_percent == 0) { + stat->oo_percent = curval; + } else { + oldval = stat->oo_percent; + stat->oo_percent = + ((oldval << 4) - oldval + curval) >> 4; + } + } + + /* Total number of SACK recovery episodes */ + stat->sack_episodes += tp->t_sack_recovery_episode; + + /* Percentage of reordered packets, shift by 10 for precision */ + curval = tp->t_reordered_pkts + tp->t_pawsdrop + tp->t_dsack_sent + + tp->t_dsack_recvd; + curval = curval << 10; + if (inp->inp_stat != NULL && (inp->inp_stat->rxpackets > 0 || + inp->inp_stat->txpackets > 0) && curval > 0) { + /* Compute percentage */ + curval = (curval * 100) / + (inp->inp_stat->rxpackets + inp->inp_stat->txpackets); + if (stat->reorder_percent == 0) { + stat->reorder_percent = curval; + } else { + oldval = stat->reorder_percent; + stat->reorder_percent = + ((oldval << 4) - oldval + curval) >> 4; + } + } + + /* 
Percentage of retransmit bytes, shift by 10 for precision */ + curval = tp->t_stat.txretransmitbytes << 10; + if (inp->inp_stat != NULL && inp->inp_stat->txbytes > 0 + && curval > 0) { + curval = (curval * 100) / inp->inp_stat->txbytes; + if (stat->rxmit_percent == 0) { + stat->rxmit_percent = curval; + } else { + oldval = stat->rxmit_percent; + stat->rxmit_percent = + ((oldval << 4) - oldval + curval) >> 4; + } + } + return; +} + /* * Close a TCP control block: * discard all space held by the tcp @@ -1316,22 +1399,95 @@ tcp_close(tp) /* free the reassembly queue, if any */ (void) tcp_freeq(tp); + + /* Collect ECN related statistics */ + if (tp->ecn_flags & TE_SETUPSENT) { + if (tp->ecn_flags & TE_CLIENT_SETUP) { + INP_INC_IFNET_STAT(inp, ecn_client_setup); + if (TCP_ECN_ENABLED(tp)) { + INP_INC_IFNET_STAT(inp, + ecn_client_success); + } else if (tp->ecn_flags & TE_LOST_SYN) { + INP_INC_IFNET_STAT(inp, ecn_syn_lost); + } else { + INP_INC_IFNET_STAT(inp, + ecn_peer_nosupport); + } + } else { + INP_INC_IFNET_STAT(inp, ecn_server_setup); + if (TCP_ECN_ENABLED(tp)) { + INP_INC_IFNET_STAT(inp, + ecn_server_success); + } else if (tp->ecn_flags & TE_LOST_SYNACK) { + INP_INC_IFNET_STAT(inp, + ecn_synack_lost); + } else { + INP_INC_IFNET_STAT(inp, + ecn_peer_nosupport); + } + } + } if (TCP_ECN_ENABLED(tp)) { - if (tp->ecn_flags & TE_RECV_ECN_CE) + if (tp->ecn_flags & TE_RECV_ECN_CE) { tcpstat.tcps_ecn_conn_recv_ce++; - if (tp->ecn_flags & TE_RECV_ECN_ECE) + INP_INC_IFNET_STAT(inp, ecn_conn_recv_ce); + } + if (tp->ecn_flags & TE_RECV_ECN_ECE) { tcpstat.tcps_ecn_conn_recv_ece++; + INP_INC_IFNET_STAT(inp, ecn_conn_recv_ece); + } if (tp->ecn_flags & (TE_RECV_ECN_CE | TE_RECV_ECN_ECE)) { if (tp->t_stat.txretransmitbytes > 0 || - tp->t_stat.rxoutoforderbytes > 0) + tp->t_stat.rxoutoforderbytes > 0) { tcpstat.tcps_ecn_conn_pl_ce++; - else + INP_INC_IFNET_STAT(inp, ecn_conn_plce); + } else { tcpstat.tcps_ecn_conn_nopl_ce++; + INP_INC_IFNET_STAT(inp, ecn_conn_noplce); + } } else { 
if (tp->t_stat.txretransmitbytes > 0 || - tp->t_stat.rxoutoforderbytes > 0) + tp->t_stat.rxoutoforderbytes > 0) { tcpstat.tcps_ecn_conn_plnoce++; + INP_INC_IFNET_STAT(inp, ecn_conn_plnoce); + } + } + + } + + /* Aggregate performance stats */ + if (inp->inp_last_outifp != NULL) { + struct ifnet *ifp = inp->inp_last_outifp; + ifnet_lock_shared(ifp); + if ((ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)) == + IFRF_ATTACHED) { + if (inp->inp_vflag & INP_IPV6) { + if (TCP_ECN_ENABLED(tp)) { + ifp->if_ipv6_stat->timestamp + = net_uptime(); + tcp_update_ecn_perf_stats(tp, + &ifp->if_ipv6_stat->ecn_on); + } else { + ifp->if_ipv6_stat->timestamp + = net_uptime(); + tcp_update_ecn_perf_stats(tp, + &ifp->if_ipv6_stat->ecn_off); + } + } else { + if (TCP_ECN_ENABLED(tp)) { + ifp->if_ipv4_stat->timestamp + = net_uptime(); + tcp_update_ecn_perf_stats(tp, + &ifp->if_ipv4_stat->ecn_on); + } else { + ifp->if_ipv4_stat->timestamp + = net_uptime(); + tcp_update_ecn_perf_stats(tp, + &ifp->if_ipv4_stat->ecn_off); + } + } } + ifnet_lock_done(ifp); } tcp_free_sackholes(tp); @@ -2451,6 +2607,7 @@ tcp_rtlookup(inp, input_ifscope) tcp_set_tso(tp, rt->rt_ifp); soif2kcl(inp->inp_socket, (rt->rt_ifp->if_eflags & IFEF_2KCL)); + tcp_set_ecn(tp, rt->rt_ifp); } /* Note if the peer is local */ @@ -2557,6 +2714,7 @@ tcp_rtlookup6(inp, input_ifscope) tcp_set_tso(tp, rt->rt_ifp); soif2kcl(inp->inp_socket, (rt->rt_ifp->if_eflags & IFEF_2KCL)); + tcp_set_ecn(tp, rt->rt_ifp); } /* Note if the peer is local */ diff --git a/bsd/netinet/tcp_timer.c b/bsd/netinet/tcp_timer.c index 046163f7b..0ffb340d0 100644 --- a/bsd/netinet/tcp_timer.c +++ b/bsd/netinet/tcp_timer.c @@ -327,6 +327,9 @@ struct tcp_last_report_stats { u_int32_t tcps_ecn_conn_plnoce; u_int32_t tcps_ecn_conn_pl_ce; u_int32_t tcps_ecn_conn_nopl_ce; + u_int32_t tcps_ecn_fallback_synloss; + u_int32_t tcps_ecn_fallback_reorder; + u_int32_t tcps_ecn_fallback_ce; /* TFO-related statistics */ u_int32_t tcps_tfo_syn_data_rcv; @@ -779,6 +782,15 @@ 
tcp_timers(tp, timer) } else { tcpstat.tcps_timeoutdrop++; } + if (tp->t_rxtshift >= TCP_MAXRXTSHIFT) { + if (TCP_ECN_ENABLED(tp)) { + INP_INC_IFNET_STAT(tp->t_inpcb, + ecn_on.rxmit_drop); + } else { + INP_INC_IFNET_STAT(tp->t_inpcb, + ecn_off.rxmit_drop); + } + } tp->t_rxtshift = TCP_MAXRXTSHIFT; postevent(so, 0, EV_TIMEOUT); soevent(so, @@ -1341,6 +1353,7 @@ tcp_timers(tp, timer) tp->t_timer[TCPT_REXMT] = 0; tcpstat.tcps_sack_recovery_episode++; + tp->t_sack_recovery_episode++; tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; tcp_ccdbg_trace(tp, NULL, TCP_CC_ENTER_FASTRECOVERY); @@ -2046,6 +2059,12 @@ tcp_report_stats(void) &prev.tcps_ecn_conn_pl_ce, &stat.ecn_conn_pl_ce); tcp_cumulative_stat(tcpstat.tcps_ecn_conn_nopl_ce, &prev.tcps_ecn_conn_nopl_ce, &stat.ecn_conn_nopl_ce); + tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_synloss, + &prev.tcps_ecn_fallback_synloss, &stat.ecn_fallback_synloss); + tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_reorder, + &prev.tcps_ecn_fallback_reorder, &stat.ecn_fallback_reorder); + tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_ce, + &prev.tcps_ecn_fallback_ce, &stat.ecn_fallback_ce); tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_rcv, &prev.tcps_tfo_syn_data_rcv, &stat.tfo_syn_data_rcv); tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_req_rcv, diff --git a/bsd/netinet/tcp_usrreq.c b/bsd/netinet/tcp_usrreq.c index 96b17ba5a..bfc86e994 100644 --- a/bsd/netinet/tcp_usrreq.c +++ b/bsd/netinet/tcp_usrreq.c @@ -671,6 +671,9 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) } else { error = ENETDOWN; } + + /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. 
*/ + so->so_flags1 &= ~SOF1_PRECONNECT_DATA; return error; } #endif /* FLOW_DIVERT */ @@ -1558,7 +1561,8 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) bzero(ti, sizeof(*ti)); ti->tcpi_state = tp->t_state; - + ti->tcpi_flowhash = inp->inp_flowhash; + if (tp->t_state > TCPS_LISTEN) { if (TSTMP_SUPPORTED(tp)) ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; @@ -1569,6 +1573,8 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) ti->tcpi_snd_wscale = tp->snd_scale; ti->tcpi_rcv_wscale = tp->rcv_scale; } + if (TCP_ECN_ENABLED(tp)) + ti->tcpi_options |= TCPI_OPT_ECN; /* Are we in retranmission episode */ if (IN_FASTRECOVERY(tp) || tp->t_rxtshift > 0) @@ -1643,6 +1649,31 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) ti->tcpi_tfo_syn_data_sent = !!(tp->t_tfo_stats & TFO_S_SYN_DATA_SENT); ti->tcpi_tfo_syn_data_acked = !!(tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED); ti->tcpi_tfo_syn_loss = !!(tp->t_tfo_stats & TFO_S_SYN_LOSS); + + ti->tcpi_ecn_client_setup = !!(tp->ecn_flags & TE_SETUPSENT); + ti->tcpi_ecn_server_setup = !!(tp->ecn_flags & TE_SETUPRECEIVED); + ti->tcpi_ecn_success = (tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON ? 
1 : 0; + ti->tcpi_ecn_lost_syn = !!(tp->ecn_flags & TE_LOST_SYN); + ti->tcpi_ecn_lost_synack = !!(tp->ecn_flags & TE_LOST_SYNACK); + + ti->tcpi_local_peer = !!(tp->t_flags & TF_LOCAL); + + if (tp->t_inpcb->inp_last_outifp != NULL) { + if (IFNET_IS_CELLULAR(tp->t_inpcb->inp_last_outifp)) + ti->tcpi_if_cell = 1; + else if (IFNET_IS_WIFI(tp->t_inpcb->inp_last_outifp)) + ti->tcpi_if_wifi = 1; + } + + ti->tcpi_ecn_recv_ce = tp->t_ecn_recv_ce; + ti->tcpi_ecn_recv_cwr = tp->t_ecn_recv_cwr; + + ti->tcpi_rcvoopack = tp->t_rcvoopack; + ti->tcpi_pawsdrop = tp->t_pawsdrop; + ti->tcpi_sack_recovery_episode = tp->t_sack_recovery_episode; + ti->tcpi_reordered_pkts = tp->t_reordered_pkts; + ti->tcpi_dsack_sent = tp->t_dsack_sent; + ti->tcpi_dsack_recvd = tp->t_dsack_recvd; } } @@ -1913,7 +1944,6 @@ tcp_ctloutput(so, sopt) case TCP_NODELAY: case TCP_NOOPT: case TCP_NOPUSH: - case TCP_ENABLE_ECN: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) @@ -1929,9 +1959,6 @@ tcp_ctloutput(so, sopt) case TCP_NOPUSH: opt = TF_NOPUSH; break; - case TCP_ENABLE_ECN: - opt = TF_ENABLE_ECN; - break; default: opt = 0; /* dead code to fool gcc */ break; @@ -2260,6 +2287,36 @@ tcp_ctloutput(so, sopt) else tcp_disable_tfo(tp); break; + case TCP_ENABLE_ECN: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + if (optval) { + tp->ecn_flags |= TE_ECN_MODE_ENABLE; + tp->ecn_flags &= ~TE_ECN_MODE_DISABLE; + } else { + tp->ecn_flags &= ~TE_ECN_MODE_ENABLE; + } + break; + case TCP_ECN_MODE: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + if (optval == ECN_MODE_DEFAULT) { + tp->ecn_flags &= ~TE_ECN_MODE_ENABLE; + tp->ecn_flags &= ~TE_ECN_MODE_DISABLE; + } else if (optval == ECN_MODE_ENABLE) { + tp->ecn_flags |= TE_ECN_MODE_ENABLE; + tp->ecn_flags &= ~TE_ECN_MODE_DISABLE; + } else if (optval == ECN_MODE_DISABLE) { + tp->ecn_flags &= ~TE_ECN_MODE_ENABLE; + tp->ecn_flags |= TE_ECN_MODE_DISABLE; + } 
else { + error = EINVAL; + } + break; case SO_FLUSH: if ((error = sooptcopyin(sopt, &optval, sizeof (optval), sizeof (optval))) != 0) @@ -2312,7 +2369,15 @@ tcp_ctloutput(so, sopt) optval = tp->t_flags & TF_NOPUSH; break; case TCP_ENABLE_ECN: - optval = (tp->t_flags & TF_ENABLE_ECN) ? 1 : 0; + optval = (tp->ecn_flags & TE_ECN_MODE_ENABLE) ? 1 : 0; + break; + case TCP_ECN_MODE: + if (tp->ecn_flags & TE_ECN_MODE_ENABLE) + optval = ECN_MODE_ENABLE; + else if (tp->ecn_flags & TE_ECN_MODE_DISABLE) + optval = ECN_MODE_DISABLE; + else + optval = ECN_MODE_DEFAULT; break; case TCP_CONNECTIONTIMEOUT: optval = tp->t_keepinit / TCP_RETRANSHZ; diff --git a/bsd/netinet/tcp_var.h b/bsd/netinet/tcp_var.h index 1ec0559ee..26f5b49d0 100644 --- a/bsd/netinet/tcp_var.h +++ b/bsd/netinet/tcp_var.h @@ -280,7 +280,6 @@ struct tcpcb { #define TF_WASFRECOVERY 0x400000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x800000 /* require MD5 digests (RFC2385) */ #define TF_MAXSEGSNT 0x1000000 /* last segment sent was a full segment */ -#define TF_ENABLE_ECN 0x2000000 /* Enable ECN */ #define TF_PMTUD 0x4000000 /* Perform Path MTU Discovery for this connection */ #define TF_CLOSING 0x8000000 /* pending tcp close */ #define TF_TSO 0x10000000 /* TCP Segment Offloading is enable on this connection */ @@ -382,16 +381,29 @@ struct tcpcb { u_int32_t rcv_by_unackwin; /* bytes seen during the last ack-stretching win */ u_int32_t rcv_nostrack_ts; /* timestamp when stretch ack was disabled automatically */ u_int16_t rcv_waitforss; /* wait for packets during slow-start */ - u_int16_t ecn_flags; -#define TE_SETUPSENT 0x01 /* Indicate we have sent ECN-SETUP SYN or SYN-ACK */ -#define TE_SETUPRECEIVED 0x02 /* Indicate we have received ECN-SETUP SYN or SYN-ACK */ -#define TE_SENDIPECT 0x04 /* Indicate we haven't sent or received non-ECN-setup SYN or SYN-ACK */ -#define TE_SENDCWR 0x08 /* Indicate that the next non-retransmit should have the TCP CWR flag set */ -#define TE_SENDECE 0x10 /* Indicate 
that the next packet should have the TCP ECE flag set */ -#define TE_INRECOVERY 0x20 /* connection entered recovery after receiving ECE */ -#define TE_RECV_ECN_CE 0x40 /* Received IPTOS_ECN_CE marking atleast once */ -#define TE_RECV_ECN_ECE 0x80 /* Received ECE marking atleast once */ -#define TE_ECN_ON (TE_SETUPSENT | TE_SETUPRECEIVED) /* Indicate ECN was successfully negotiated on a connection) */ + +/* ECN stats */ + u_int16_t ecn_flags; +#define TE_SETUPSENT 0x0001 /* Indicate we have sent ECN-SETUP SYN or SYN-ACK */ +#define TE_SETUPRECEIVED 0x0002 /* Indicate we have received ECN-SETUP SYN or SYN-ACK */ +#define TE_SENDIPECT 0x0004 /* Indicate we haven't sent or received non-ECN-setup SYN or SYN-ACK */ +#define TE_SENDCWR 0x0008 /* Indicate that the next non-retransmit should have the TCP CWR flag set */ +#define TE_SENDECE 0x0010 /* Indicate that the next packet should have the TCP ECE flag set */ +#define TE_INRECOVERY 0x0020 /* connection entered recovery after receiving ECE */ +#define TE_RECV_ECN_CE 0x0040 /* Received IPTOS_ECN_CE marking atleast once */ +#define TE_RECV_ECN_ECE 0x0080 /* Received ECE marking atleast once */ +#define TE_LOST_SYN 0x0100 /* Lost SYN with ECN setup */ +#define TE_LOST_SYNACK 0x0200 /* Lost SYN-ACK with ECN setup */ +#define TE_ECN_MODE_ENABLE 0x0400 /* Option ECN mode set to enable */ +#define TE_ECN_MODE_DISABLE 0x0800 /* Option ECN mode set to disable */ +#define TE_ENABLE_ECN 0x1000 /* Enable negotiation of ECN */ +#define TE_ECN_ON (TE_SETUPSENT | TE_SETUPRECEIVED) /* Indicate ECN was successfully negotiated on a connection) */ +#define TE_CEHEURI_SET 0x2000 /* We did our CE-probing at the beginning */ +#define TE_CLIENT_SETUP 0x4000 /* setup from client side */ + + u_int32_t t_ecn_recv_ce; /* Received CE from the network */ + u_int32_t t_ecn_recv_cwr; /* Packets received with CWR */ + u_int8_t t_ecn_recv_ce_pkt; /* Received packet with CE-bit set (independent from last_ack_sent) */ /* state for bad retransmit recovery 
*/ u_int32_t snd_cwnd_prev; /* cwnd prior to retransmit */ @@ -445,7 +457,7 @@ struct tcpcb { uint32_t rtt_count; /* Number of RTT samples in recent base history */ uint32_t bg_ssthresh; /* Slow start threshold until delay increases */ uint32_t t_flagsext; /* Another field to accommodate more flags */ -#define TF_RXTFINDROP 0x1 /* Drop conn after retransmitting FIN 3 times */ +#define TF_RXTFINDROP 0x1 /* Drop conn after retransmitting FIN 3 times */ #define TF_RCVUNACK_WAITSS 0x2 /* set when the receiver should not stretch acks */ #define TF_BWMEAS_INPROGRESS 0x4 /* Indicate BW meas is happening */ #define TF_MEASURESNDBW 0x8 /* Measure send bw on this connection */ @@ -587,6 +599,13 @@ struct tcpcb { #define TFO_PROBE_PROBING 1 /* Sending out TCP-keepalives waiting for reply */ #define TFO_PROBE_WAIT_DATA 2 /* Received reply, waiting for data */ u_int8_t t_tfo_probe_state; + + u_int32_t t_rcvoopack; /* out-of-order packets received */ + u_int32_t t_pawsdrop; /* segments dropped due to PAWS */ + u_int32_t t_sack_recovery_episode; /* SACK recovery episodes */ + u_int32_t t_reordered_pkts; /* packets reorderd */ + u_int32_t t_dsack_sent; /* Sent DSACK notification */ + u_int32_t t_dsack_recvd; /* Received a valid DSACK option */ }; #define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY) @@ -1107,6 +1126,9 @@ struct tcpstat { u_int32_t tcps_ecn_conn_plnoce; /* Number of connections that received no CE and sufferred packet loss */ u_int32_t tcps_ecn_conn_pl_ce; /* Number of connections that received CE and sufferred packet loss */ u_int32_t tcps_ecn_conn_nopl_ce; /* Number of connections that received CE and sufferred no packet loss */ + u_int32_t tcps_ecn_fallback_synloss; /* Number of times we did fall back due to SYN-Loss */ + u_int32_t tcps_ecn_fallback_reorder; /* Number of times we fallback because we detected the PAWS-issue */ + u_int32_t tcps_ecn_fallback_ce; /* Number of times we fallback because we received too many CEs */ /* TFO-related statistics */ 
u_int32_t tcps_tfo_syn_data_rcv; /* Received a SYN+data with valid cookie */ @@ -1121,6 +1143,7 @@ struct tcpstat { u_int32_t tcps_tfo_blackhole; /* TFO got blackholed by a middlebox. */ }; + struct tcpstat_local { u_int64_t badformat; u_int64_t unspecv6; @@ -1467,6 +1490,7 @@ void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); int32_t tcp_sbspace(struct tcpcb *tp); void tcp_set_tso(struct tcpcb *tp, struct ifnet *ifp); +void tcp_set_ecn(struct tcpcb *tp, struct ifnet *ifp); void tcp_reset_stretch_ack(struct tcpcb *tp); extern void tcp_get_ports_used(u_int32_t, int, u_int32_t, bitstr_t *); uint32_t tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags); diff --git a/bsd/netinet6/ah_input.c b/bsd/netinet6/ah_input.c index 28f53d5cc..bfd976bb7 100644 --- a/bsd/netinet6/ah_input.c +++ b/bsd/netinet6/ah_input.c @@ -412,8 +412,9 @@ ah4_input(struct mbuf *m, int off) * XXX more sanity checks * XXX relationship with gif? */ - u_int8_t tos; - + u_int8_t tos, otos; + int sum; + if (ifamily == AF_INET6) { ipseclog((LOG_NOTICE, "ipsec tunnel protocol mismatch " "in IPv4 AH input: %s\n", ipsec_logsastr(sav))); @@ -429,11 +430,21 @@ ah4_input(struct mbuf *m, int off) } } ip = mtod(m, struct ip *); + otos = ip->ip_tos; /* ECN consideration. 
*/ if (ip_ecn_egress(ip4_ipsec_ecn, &tos, &ip->ip_tos) == 0) { IPSEC_STAT_INCREMENT(ipsecstat.in_inval); goto fail; } + + if (otos != ip->ip_tos) { + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += (~otos & 0xffff) + ip->ip_tos; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + ip->ip_sum = htons(~sum & 0xffff); + } + if (!key_checktunnelsanity(sav, AF_INET, (caddr_t)&ip->ip_src, (caddr_t)&ip->ip_dst)) { ipseclog((LOG_NOTICE, "ipsec tunnel address mismatch " diff --git a/bsd/netinet6/esp_input.c b/bsd/netinet6/esp_input.c index 277e6963e..1718e87bd 100644 --- a/bsd/netinet6/esp_input.c +++ b/bsd/netinet6/esp_input.c @@ -521,7 +521,8 @@ esp4_input(m, off) * XXX more sanity checks * XXX relationship with gif? */ - u_int8_t tos; + u_int8_t tos, otos; + int sum; tos = ip->ip_tos; m_adj(m, off + esplen + ivlen); @@ -537,10 +538,21 @@ esp4_input(m, off) } ip = mtod(m, struct ip *); /* ECN consideration. */ + + otos = ip->ip_tos; if (ip_ecn_egress(ip4_ipsec_ecn, &tos, &ip->ip_tos) == 0) { IPSEC_STAT_INCREMENT(ipsecstat.in_inval); goto bad; } + + if (otos != ip->ip_tos) { + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += (~otos & 0xffff) + ip->ip_tos; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + ip->ip_sum = htons(~sum & 0xffff); + } + if (!key_checktunnelsanity(sav, AF_INET, (caddr_t)&ip->ip_src, (caddr_t)&ip->ip_dst)) { ipseclog((LOG_ERR, "ipsec tunnel address mismatch " @@ -1187,12 +1199,26 @@ esp6_input(struct mbuf **mp, int *offp, int proto) goto bad; } } + + u_int8_t otos; + int sum; + ip = mtod(m, struct ip *); + otos = ip->ip_tos; /* ECN consideration. 
*/ if (ip46_ecn_egress(ip6_ipsec_ecn, &flowinfo, &ip->ip_tos) == 0) { IPSEC_STAT_INCREMENT(ipsecstat.in_inval); goto bad; } + + if (otos != ip->ip_tos) { + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += (~otos & 0xffff) + ip->ip_tos; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + ip->ip_sum = htons(~sum & 0xffff); + } + if (!key_checktunnelsanity(sav, AF_INET, (caddr_t)&ip->ip_src, (caddr_t)&ip->ip_dst)) { ipseclog((LOG_ERR, "ipsec tunnel address mismatch " diff --git a/bsd/netinet6/in6_gif.c b/bsd/netinet6/in6_gif.c index 7058b3976..54840b69e 100644 --- a/bsd/netinet6/in6_gif.c +++ b/bsd/netinet6/in6_gif.c @@ -265,7 +265,9 @@ in6_gif_input(struct mbuf **mp, int *offp, int proto) case IPPROTO_IPV4: { struct ip *ip; - u_int8_t otos8; + u_int8_t otos8, old_tos; + int sum; + af = AF_INET; otos8 = (ntohl(otos) >> 20) & 0xff; if (mbuf_len(m) < sizeof (*ip)) { @@ -274,9 +276,17 @@ in6_gif_input(struct mbuf **mp, int *offp, int proto) return (IPPROTO_DONE); } ip = mtod(m, struct ip *); - if (gifp->if_flags & IFF_LINK1) + if (gifp->if_flags & IFF_LINK1) { + old_tos = ip->ip_tos; egress_success = ip_ecn_egress(ECN_NORMAL, &otos8, &ip->ip_tos); - else + if (old_tos != ip->ip_tos) { + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += (~old_tos & 0xffff) + ip->ip_tos; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + ip->ip_sum = htons(~sum & 0xffff); + } + } else egress_success = ip_ecn_egress(ECN_NOCARE, &otos8, &ip->ip_tos); break; } diff --git a/bsd/netinet6/in6_pcb.c b/bsd/netinet6/in6_pcb.c index 660f8da4f..35888ba18 100644 --- a/bsd/netinet6/in6_pcb.c +++ b/bsd/netinet6/in6_pcb.c @@ -200,6 +200,7 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) return (EINVAL); if (!(so->so_options & (SO_REUSEADDR|SO_REUSEPORT))) wild = 1; + socket_unlock(so, 0); /* keep reference */ lck_rw_lock_exclusive(pcbinfo->ipi_lock); @@ -367,6 +368,16 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) } 
socket_lock(so, 0); + /* + * We unlocked socket's protocol lock for a long time. + * The socket might have been dropped/defuncted. + * Checking if world has changed since. + */ + if (inp->inp_state == INPCB_STATE_DEAD) { + lck_rw_done(pcbinfo->ipi_lock); + return (ECONNABORTED); + } + /* check if the socket got bound when the lock was released */ if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { lck_rw_done(pcbinfo->ipi_lock); diff --git a/bsd/netinet6/ip6_output.c b/bsd/netinet6/ip6_output.c index 7767822d7..0bdaaa7e2 100644 --- a/bsd/netinet6/ip6_output.c +++ b/bsd/netinet6/ip6_output.c @@ -844,8 +844,6 @@ ip6_output_list(struct mbuf *m0, int packetchain, struct ip6_pktopts *opt, (mbuf_t *)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); - if (m != NULL) - m_freem(m); m = NULL; goto evaluateloop; } diff --git a/bsd/netinet6/nd6.c b/bsd/netinet6/nd6.c index 8fe0d4d9e..ce0dfdf21 100644 --- a/bsd/netinet6/nd6.c +++ b/bsd/netinet6/nd6.c @@ -2692,11 +2692,12 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) case SIOCSPFXFLUSH_IN6: { /* struct in6_ifreq */ /* flush all the prefix advertised by routers */ - struct nd_prefix *next; + struct nd_prefix *next = NULL; lck_mtx_lock(nd6_mutex); for (pr = nd_prefix.lh_first; pr; pr = next) { - struct in6_ifaddr *ia; + struct in6_ifaddr *ia = NULL; + bool iterate_pfxlist_again = false; next = pr->ndpr_next; @@ -2741,9 +2742,8 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) * The same applies for the prefix list. */ ia = in6_ifaddrs; - next = nd_prefix.lh_first; + iterate_pfxlist_again = true; continue; - } IFA_UNLOCK(&ia->ia_ifa); ia = ia->ia_next; @@ -2753,17 +2753,10 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) prelist_remove(pr); NDPR_UNLOCK(pr); pfxlist_onlink_check(); - /* - * If we were trying to restart this loop - * above by changing the value of 'next', we might - * end up freeing the only element on the list - * when we call NDPR_REMREF(). 
- * When this happens, we also have get out of this - * loop because we have nothing else to do. - */ - if (pr == next) - next = NULL; NDPR_REMREF(pr); + if (iterate_pfxlist_again) { + next = nd_prefix.lh_first; + } } lck_mtx_unlock(nd6_mutex); break; diff --git a/bsd/nfs/nfs_gss.c b/bsd/nfs/nfs_gss.c index 3414db347..0283fa918 100644 --- a/bsd/nfs/nfs_gss.c +++ b/bsd/nfs/nfs_gss.c @@ -518,15 +518,7 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, uint32_t lck_mtx_unlock(cp->gss_clnt_mtx); } - MALLOC(ki, gss_key_info *, sizeof (gss_key_info), M_TEMP, M_WAITOK|M_ZERO); - if (ki == NULL) { - lck_mtx_unlock(&nmp->nm_lock); - return (ENOMEM); - } - - if (cp) { - cp->gss_clnt_kinfo = ki; - } else if (nfs_root_steals_ctx && principal == NULL && kauth_cred_getuid(req->r_cred) == 0) { + if (!cp && nfs_root_steals_ctx && principal == NULL && kauth_cred_getuid(req->r_cred) == 0) { /* * If superuser is trying to get access, then co-opt * the first valid context in the list. @@ -544,6 +536,12 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, uint32_t } } + MALLOC(ki, gss_key_info *, sizeof (gss_key_info), M_TEMP, M_WAITOK|M_ZERO); + if (ki == NULL) { + lck_mtx_unlock(&nmp->nm_lock); + return (ENOMEM); + } + NFS_GSS_DBG("Context %s%sfound in Neg Cache @ %ld\n", NFS_GSS_CTX(req, cp), cp == NULL ? 
" not " : "", @@ -573,6 +571,7 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, uint32_t nfs_gss_clnt_mnt_ref(nmp); } } else { + cp->gss_clnt_kinfo = ki; nfs_gss_clnt_ctx_clean(cp); if (principal) { /* diff --git a/bsd/nfs/nfs_syscalls.c b/bsd/nfs/nfs_syscalls.c index 12daa5588..1b082e748 100644 --- a/bsd/nfs/nfs_syscalls.c +++ b/bsd/nfs/nfs_syscalls.c @@ -669,6 +669,11 @@ fhopen( proc_t p, goto bad; } +#if CONFIG_MACF + if ((error = mac_vnode_check_open(ctx, vp, fmode))) + goto bad; +#endif + /* compute action to be authorized */ action = 0; if (fmode & FREAD) diff --git a/bsd/security/audit/audit_session.c b/bsd/security/audit/audit_session.c index ab38cc22a..639920f5e 100644 --- a/bsd/security/audit/audit_session.c +++ b/bsd/security/audit/audit_session.c @@ -1165,14 +1165,14 @@ audit_session_setaia(proc_t p, auditinfo_addr_t *new_aia_p) my_new_cred = kauth_cred_setauditinfo(my_cred, &tmp_as); if (my_cred != my_new_cred) { - proc_lock(p); + proc_ucred_lock(p); /* Need to protect for a race where another thread also * changed the credential after we took our reference. * If p_ucred has changed then we should restart this * again with the new cred. */ if (p->p_ucred != my_cred) { - proc_unlock(p); + proc_ucred_unlock(p); audit_session_unref(my_new_cred); kauth_cred_unref(&my_new_cred); /* try again */ @@ -1182,7 +1182,7 @@ audit_session_setaia(proc_t p, auditinfo_addr_t *new_aia_p) p->p_ucred = my_new_cred; /* update cred on proc */ PROC_UPDATE_CREDS_ONPROC(p); - proc_unlock(p); + proc_ucred_unlock(p); } /* * Drop old proc reference or our extra reference. 
@@ -1390,12 +1390,12 @@ audit_session_join_internal(proc_t p, ipc_port_t port, au_asid_t *new_asid) goto done; } - proc_lock(p); + proc_ucred_lock(p); kauth_cred_ref(p->p_ucred); my_cred = p->p_ucred; if (!IS_VALID_CRED(my_cred)) { kauth_cred_unref(&my_cred); - proc_unlock(p); + proc_ucred_unlock(p); err = ESRCH; goto done; } @@ -1421,7 +1421,7 @@ audit_session_join_internal(proc_t p, ipc_port_t port, au_asid_t *new_asid) /* Increment the proc count of new session */ audit_inc_procount(AU_SENTRY_PTR(new_aia_p)); - proc_unlock(p); + proc_ucred_unlock(p); /* Propagate the change from the process to the Mach task. */ set_security_token(p); @@ -1429,7 +1429,7 @@ audit_session_join_internal(proc_t p, ipc_port_t port, au_asid_t *new_asid) /* Decrement the process count of the former session. */ audit_dec_procount(AU_SENTRY_PTR(old_aia_p)); } else { - proc_unlock(p); + proc_ucred_unlock(p); } kauth_cred_unref(&my_cred); diff --git a/bsd/sys/coalition.h b/bsd/sys/coalition.h index 62b205eb7..cf2811c5b 100644 --- a/bsd/sys/coalition.h +++ b/bsd/sys/coalition.h @@ -104,6 +104,21 @@ extern int coalitions_get_list(int type, struct procinfo_coalinfo *coal_list, in */ extern boolean_t coalition_is_leader(task_t task, int coal_type, coalition_t *coal); +/* + * coalition_get_leader: + * Get a task reference on the leader of a given coalition + * + * Parameters: + * coal : The coalition to investigate + * + * Returns: A referenced task pointer of the leader of the given coalition. + * This could be TASK_NULL if the coalition doesn't have a leader. + * If the return value is non-null, the caller is responsible to call + * task_deallocate on the returned value. 
+ */ +extern task_t coalition_get_leader(coalition_t coal); + + /* * coalition_get_task_count: * Sum up the number of tasks in the given coalition diff --git a/bsd/sys/kdebug.h b/bsd/sys/kdebug.h index 48dfac84a..d2e340aa2 100644 --- a/bsd/sys/kdebug.h +++ b/bsd/sys/kdebug.h @@ -323,6 +323,7 @@ extern void kernel_debug_string_simple(const char *message); #define DBG_MACH_SFI 0xA2 /* Selective Forced Idle (SFI) */ #define DBG_MACH_ENERGY_PERF 0xA3 /* Energy/performance resource stats */ #define DBG_MACH_SYSDIAGNOSE 0xA4 /* sysdiagnose keychord */ +#define DBG_MACH_ZALLOC 0xA5 /* Zone allocator */ /* Codes for Scheduler (DBG_MACH_SCHED) */ #define MACH_SCHED 0x0 /* Scheduler */ @@ -440,6 +441,10 @@ extern void kernel_debug_string_simple(const char *message); #define SFI_PID_SET_MANAGED 0x8 #define SFI_PID_CLEAR_MANAGED 0x9 #define SFI_GLOBAL_DEFER 0xa + +/* Codes for Zone Allocator (DBG_MACH_ZALLOC) */ +#define ZALLOC_ZCRAM 0x0 + /* **** The Kernel Debug Sub Classes for Network (DBG_NETWORK) **** */ #define DBG_NETIP 1 /* Internet Protocol */ #define DBG_NETARP 2 /* Address Resolution Protocol */ diff --git a/bsd/sys/proc_info.h b/bsd/sys/proc_info.h index 443861ddc..afd022407 100644 --- a/bsd/sys/proc_info.h +++ b/bsd/sys/proc_info.h @@ -844,6 +844,19 @@ struct proc_fileportinfo { #define LISTCOALITIONS_SINGLE_TYPE 2 #define LISTCOALITIONS_SINGLE_TYPE_SIZE (sizeof(struct procinfo_coalinfo)) +/* reasons for proc_can_use_foreground_hw */ +#define PROC_FGHW_OK 0 /* pid may use foreground HW */ +#define PROC_FGHW_DAEMON_OK 1 +#define PROC_FGHW_DAEMON_LEADER 10 /* pid is in a daemon coalition */ +#define PROC_FGHW_LEADER_NONUI 11 /* coalition leader is in a non-focal state */ +#define PROC_FGHW_LEADER_BACKGROUND 12 /* coalition leader is in a background state */ +#define PROC_FGHW_DAEMON_NO_VOUCHER 13 /* pid is a daemon with no adopted voucher */ +#define PROC_FGHW_NO_VOUCHER_ATTR 14 /* pid has adopted a voucher with no bank/originator attribute */ +#define 
PROC_FGHW_NO_ORIGINATOR 15 /* pid has adopted a voucher for a process that's gone away */ +#define PROC_FGHW_ORIGINATOR_BACKGROUND 16 /* pid has adopted a voucher for an app that's in the background */ +#define PROC_FGHW_VOUCHER_ERROR 98 /* error in voucher / originator callout */ +#define PROC_FGHW_ERROR 99 /* syscall parameter/permissions error */ + /* __proc_info() call numbers */ #define PROC_INFO_CALL_LISTPIDS 0x1 #define PROC_INFO_CALL_PIDINFO 0x2 @@ -856,6 +869,7 @@ struct proc_fileportinfo { #define PROC_INFO_CALL_PIDRUSAGE 0x9 #define PROC_INFO_CALL_PIDORIGINATORINFO 0xa #define PROC_INFO_CALL_LISTCOALITIONS 0xb +#define PROC_INFO_CALL_CANUSEFGHW 0xc #endif /* PRIVATE */ diff --git a/bsd/sys/proc_internal.h b/bsd/sys/proc_internal.h index a3d8487e5..f5ad0fcf4 100644 --- a/bsd/sys/proc_internal.h +++ b/bsd/sys/proc_internal.h @@ -93,6 +93,7 @@ __END_DECLS * PL = Process Lock * PGL = Process Group Lock * PFDL = Process File Desc Lock + * PUCL = Process User Credentials Lock * PSL = Process Spin Lock * PPL = Parent Process Lock (planed for later usage) * LL = List Lock @@ -221,9 +222,10 @@ struct proc { TAILQ_HEAD( ,eventqelt) p_evlist; /* (PL) */ lck_mtx_t p_fdmlock; /* proc lock to protect fdesc */ + lck_mtx_t p_ucred_mlock; /* mutex lock to protect p_ucred */ /* substructures: */ - kauth_cred_t p_ucred; /* Process owner's identity. (PL) */ + kauth_cred_t p_ucred; /* Process owner's identity. (PUCL) */ struct filedesc *p_fd; /* Ptr to open files structure. (PFDL) */ struct pstats *p_stats; /* Accounting/statistics (PL). 
*/ struct plimit *p_limit; /* Process limits.(PL) */ @@ -661,6 +663,7 @@ extern lck_grp_t * proc_lck_grp; #if CONFIG_FINE_LOCK_GROUPS extern lck_grp_t * proc_mlock_grp; extern lck_grp_t * proc_fdmlock_grp; +extern lck_grp_t * proc_ucred_mlock_grp; extern lck_grp_t * proc_slock_grp; #endif extern lck_grp_attr_t * proc_lck_grp_attr; @@ -683,6 +686,8 @@ extern void proc_fdlock(struct proc *); extern void proc_fdlock_spin(struct proc *); extern void proc_fdunlock(struct proc *); extern void proc_fdlock_assert(proc_t p, int assertflags); +extern void proc_ucred_lock(struct proc *); +extern void proc_ucred_unlock(struct proc *); __private_extern__ int proc_core_name(const char *name, uid_t uid, pid_t pid, char *cr_name, size_t cr_name_len); extern int isinferior(struct proc *, struct proc *); @@ -741,6 +746,7 @@ extern proc_t proc_parentholdref(proc_t); extern int proc_parentdropref(proc_t, int); int itimerfix(struct timeval *tv); int itimerdecr(struct proc * p, struct itimerval *itp, int usec); +int timespec_is_valid(const struct timespec *); void proc_signalstart(struct proc *, int locked); void proc_signalend(struct proc *, int locked); int proc_transstart(struct proc *, int locked, int non_blocking); diff --git a/bsd/sys/sockio.h b/bsd/sys/sockio.h index 96f2519e5..d5f8ac636 100644 --- a/bsd/sys/sockio.h +++ b/bsd/sys/sockio.h @@ -271,6 +271,9 @@ #define SIOCGIFFUNCTIONALTYPE _IOWR('i', 173, struct ifreq) /* get interface functional type */ #define SIOCSIFNETSIGNATURE _IOWR('i', 174, struct if_nsreq) #define SIOCGIFNETSIGNATURE _IOWR('i', 175, struct if_nsreq) + +#define SIOCGECNMODE _IOWR('i', 176, struct ifreq) +#define SIOCSECNMODE _IOW('i', 177, struct ifreq) #endif /* PRIVATE */ #endif /* !_SYS_SOCKIO_H_ */ diff --git a/bsd/sys/systm.h b/bsd/sys/systm.h index 2cbbba27f..7d5a43c31 100644 --- a/bsd/sys/systm.h +++ b/bsd/sys/systm.h @@ -217,6 +217,7 @@ void bsd_timeout(void (*)(void *), void *arg, struct timespec * ts); void bsd_untimeout(void (*)(void *), void 
*arg); void set_fsblocksize(struct vnode *); uint64_t tvtoabstime(struct timeval *); +uint64_t tstoabstime(struct timespec *); void *throttle_info_create(void); void throttle_info_mount_ref(mount_t mp, void * throttle_info); void throttle_info_mount_rel(mount_t mp); diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index 57fe3431a..c565a3df4 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -2405,6 +2405,7 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in goto wait_for_dwrites; } + task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE); while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) { int throttle_type; @@ -6008,6 +6009,7 @@ cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) int retval = 0; int xsize; upl_page_info_t *pl; + int dirty_count; xsize = *io_resid; @@ -6044,10 +6046,13 @@ cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) pg_offset = upl_offset & PAGE_MASK; csize = min(PAGE_SIZE - pg_offset, xsize); + dirty_count = 0; while (xsize && retval == 0) { addr64_t paddr; paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset; + if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE)) + dirty_count++; retval = uiomove64(paddr, csize, uio); @@ -6060,6 +6065,7 @@ cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) uio->uio_segflg = segflg; + task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, (int)uio->uio_offset, xsize, retval, segflg, 0); diff --git a/bsd/vfs/vfs_journal.c b/bsd/vfs/vfs_journal.c index 23b21860a..714ba335a 100644 --- a/bsd/vfs/vfs_journal.c +++ b/bsd/vfs/vfs_journal.c @@ -2991,6 +2991,7 @@ journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(buf_t bp, vo blhdr->binfo[i].bnum = 
(off_t)(buf_blkno(bp)); blhdr->binfo[i].u.bp = bp; + task_update_logical_writes(current_task(), (2 * bsize), TASK_WRITE_METADATA); KERNEL_DEBUG_CONSTANT(0x3018004, VM_KERNEL_ADDRPERM(vp), blhdr->binfo[i].bnum, bsize, 0, 0); if (func) { diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index 9fec68cd4..ca47a42da 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -7425,6 +7425,9 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i goto out; } + /* Assume that there were DENYs so we don't wrongly cache KAUTH_VNODE_SEARCHBYANYONE */ + found_deny = TRUE; + KAUTH_DEBUG("%p ALLOWED - caller is superuser", vp); } out: @@ -7807,10 +7810,10 @@ vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_ * If the size is being set, make sure it's not a directory. */ if (VATTR_IS_ACTIVE(vap, va_data_size)) { - /* size is meaningless on a directory, don't permit this */ - if (vnode_isdir(vp)) { - KAUTH_DEBUG("ATTR - ERROR: size change requested on a directory"); - error = EISDIR; + /* size is only meaningful on regular files, don't permit otherwise */ + if (!vnode_isreg(vp)) { + KAUTH_DEBUG("ATTR - ERROR: size change requested on non-file"); + error = vnode_isdir(vp) ? 
EISDIR : EINVAL; goto out; } } diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c index a949a717d..adea23a19 100644 --- a/bsd/vfs/vfs_syscalls.c +++ b/bsd/vfs/vfs_syscalls.c @@ -414,6 +414,14 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 AUDIT_ARG(fflags, flags); +#if SECURE_KERNEL + if (flags & MNT_UNION) { + /* No union mounts on release kernels */ + error = EPERM; + goto out; + } +#endif + if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) { if (!(flags & MNT_UNION)) { @@ -431,7 +439,7 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 flags = (flags & ~(MNT_UPDATE)); } -#ifdef SECURE_KERNEL +#if SECURE_KERNEL if ((flags & MNT_RDONLY) == 0) { /* Release kernels are not allowed to mount "/" as rw */ error = EPERM; @@ -6500,7 +6508,7 @@ copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval) return(EINVAL); } - NDINIT(&fromnd, LOOKUP, OP_COPYFILE, SAVESTART | AUDITVNPATH1, + NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1, UIO_USERSPACE, uap->from, ctx); if ((error = namei(&fromnd))) return (error); @@ -6555,8 +6563,6 @@ copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval) out1: vnode_put(fvp); - if (fromnd.ni_startdir) - vnode_put(fromnd.ni_startdir); nameidone(&fromnd); if (error == -1) diff --git a/config/IOKit.exports b/config/IOKit.exports index 29c12c6f5..958678722 100644 --- a/config/IOKit.exports +++ b/config/IOKit.exports @@ -741,18 +741,6 @@ __ZN18IOTimerEventSourceC2EPK11OSMetaClass __ZN18IOTimerEventSourceC2Ev __ZN18IOTimerEventSourceD0Ev __ZN18IOTimerEventSourceD2Ev -__ZN18IOUserNotification10gMetaClassE -__ZN18IOUserNotification10superClassE -__ZN18IOUserNotification15setNotificationEP10IONotifier -__ZN18IOUserNotification4freeEv -__ZN18IOUserNotification4initEv -__ZN18IOUserNotification5resetEv -__ZN18IOUserNotification7isValidEv -__ZN18IOUserNotification9MetaClassC1Ev 
-__ZN18IOUserNotification9MetaClassC2Ev -__ZN18IOUserNotification9metaClassE -__ZN18IOUserNotificationC2EPK11OSMetaClass -__ZN18IOUserNotificationD2Ev __ZN18_IOServiceNotifier10gMetaClassE __ZN18_IOServiceNotifier10superClassE __ZN18_IOServiceNotifier4freeEv diff --git a/config/MasterVersion b/config/MasterVersion index 3aded3442..a378f284b 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -15.0.0 +15.2.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. diff --git a/iokit/IOKit/IOKitKeysPrivate.h b/iokit/IOKit/IOKitKeysPrivate.h index bea0d10f0..061c64ae3 100644 --- a/iokit/IOKit/IOKitKeysPrivate.h +++ b/iokit/IOKit/IOKitKeysPrivate.h @@ -110,5 +110,4 @@ enum { kIOClassNameOverrideNone = 0x00000001, }; - #endif /* ! _IOKIT_IOKITKEYSPRIVATE_H */ diff --git a/iokit/IOKit/pwr_mgt/RootDomain.h b/iokit/IOKit/pwr_mgt/RootDomain.h index 21ca1a97a..2ecc15e63 100644 --- a/iokit/IOKit/pwr_mgt/RootDomain.h +++ b/iokit/IOKit/pwr_mgt/RootDomain.h @@ -507,6 +507,7 @@ class IOPMrootDomain: public IOService bool activitySinceSleep(void); bool abortHibernation(void); + void updateConsoleUsers(void); IOReturn joinAggressiveness( IOService * service ); void handleAggressivesRequests( void ); @@ -718,6 +719,7 @@ class IOPMrootDomain: public IOService unsigned int toldPowerdCapWillChange :1; unsigned int displayPowerOnRequested:1; + uint8_t tasksSuspended; uint32_t hibernateMode; AbsoluteTime userActivityTime; AbsoluteTime userActivityTime_prev; diff --git a/iokit/Kernel/IONVRAM.cpp b/iokit/Kernel/IONVRAM.cpp index 17cd841bc..ebaa3087d 100644 --- a/iokit/Kernel/IONVRAM.cpp +++ b/iokit/Kernel/IONVRAM.cpp @@ -297,7 +297,10 @@ bool IODTNVRAM::serializeProperties(OSSerialize *s) const && (current_task() == kernel_task || mac_iokit_check_nvram_get(kauth_cred_get(), key->getCStringNoCopy()) == 0) #endif ) { } - else dict->removeObject(key); + else { + 
dict->removeObject(key); + iter->reset(); + } } } diff --git a/iokit/Kernel/IOPMrootDomain.cpp b/iokit/Kernel/IOPMrootDomain.cpp index a27622128..2d6c7b79f 100644 --- a/iokit/Kernel/IOPMrootDomain.cpp +++ b/iokit/Kernel/IOPMrootDomain.cpp @@ -664,8 +664,19 @@ IOPMrootDomain * IOPMrootDomain::construct( void ) //****************************************************************************** static void updateConsoleUsersCallout(thread_call_param_t p0, thread_call_param_t p1) +{ + IOPMrootDomain * rootDomain = (IOPMrootDomain *) p0; + rootDomain->updateConsoleUsers(); +} + +void IOPMrootDomain::updateConsoleUsers(void) { IOService::updateConsoleUsers(NULL, kIOMessageSystemHasPoweredOn); + if (tasksSuspended) + { + tasksSuspended = FALSE; + tasks_system_suspend(tasksSuspended); + } } //****************************************************************************** @@ -3077,6 +3088,16 @@ void IOPMrootDomain::willNotifyPowerChildren( IOPMPowerStateIndex newPowerState if (SLEEP_STATE == newPowerState) { + if (!tasksSuspended) + { + AbsoluteTime deadline; + tasksSuspended = TRUE; + tasks_system_suspend(tasksSuspended); + + clock_interval_to_deadline(10, kSecondScale, &deadline); + vm_pageout_wait(AbsoluteTime_to_scalar(&deadline)); + } + #if HIBERNATION IOHibernateSystemSleep(); IOHibernateIOKitSleep(); diff --git a/iokit/Kernel/IOService.cpp b/iokit/Kernel/IOService.cpp index bed5b5e4e..6ff5f289d 100644 --- a/iokit/Kernel/IOService.cpp +++ b/iokit/Kernel/IOService.cpp @@ -411,6 +411,7 @@ void IOService::initialize( void ) gIOStopProviderList = OSArray::withCapacity( 16 ); gIOFinalizeList = OSArray::withCapacity( 16 ); assert( gIOTerminatePhase2List && gIOStopList && gIOStopProviderList && gIOFinalizeList ); + } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -4039,6 +4040,7 @@ OSObject * IOService::copyExistingServices( OSDictionary * matching, const OSSymbol * sym = OSSymbol::withString(str); OSMetaClass::applyToInstancesOfClassName(sym, 
instanceMatch, &ctx); sym->release(); + } else { @@ -5071,7 +5073,9 @@ bool IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * { count = table->getCount(); done = 0; + str = OSDynamicCast(OSString, table->getObject(gIOProviderClassKey)); + if (str) { done++; match = ((kIOServiceClassDone & options) || (0 != metaCast(str))); @@ -5232,6 +5236,7 @@ bool IOService::matchPassive(OSDictionary * table, uint32_t options) do { count = table->getCount(); + if (!(kIOServiceInternalDone & options)) { match = where->matchInternal(table, options, &done); @@ -5244,7 +5249,7 @@ bool IOService::matchPassive(OSDictionary * table, uint32_t options) // do family specific matching match = where->matchPropertyTable( table, &score ); - + if( !match) { #if IOMATCHDEBUG if( kIOLogMatch & getDebugFlags( table )) @@ -5267,7 +5272,8 @@ bool IOService::matchPassive(OSDictionary * table, uint32_t options) nextTable = OSDynamicCast(OSDictionary, table->getObject( gIOParentMatchKey )); - if( nextTable) { + if(nextTable) { + // look for a matching entry anywhere up to root match = false; matchParent = true; diff --git a/iokit/Kernel/IOUserClient.cpp b/iokit/Kernel/IOUserClient.cpp index 6c9ec5df7..8c628ddf1 100644 --- a/iokit/Kernel/IOUserClient.cpp +++ b/iokit/Kernel/IOUserClient.cpp @@ -354,16 +354,32 @@ void IOMachPort::free( void ) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ -class IOUserNotification : public OSIterator +class IOUserIterator : public OSIterator +{ + OSDeclareDefaultStructors(IOUserIterator) +public: + OSObject * userIteratorObject; + IOLock * lock; + + static IOUserIterator * withIterator(OSIterator * iter); + virtual bool init( void ) APPLE_KEXT_OVERRIDE; + virtual void free() APPLE_KEXT_OVERRIDE; + + virtual void reset() APPLE_KEXT_OVERRIDE; + virtual bool isValid() APPLE_KEXT_OVERRIDE; + virtual OSObject * getNextObject() APPLE_KEXT_OVERRIDE; +}; + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 
* * */ + +class IOUserNotification : public IOUserIterator { OSDeclareDefaultStructors(IOUserNotification) - IONotifier * holdNotify; - IOLock * lock; +#define holdNotify userIteratorObject public: - virtual bool init( void ) APPLE_KEXT_OVERRIDE; virtual void free() APPLE_KEXT_OVERRIDE; virtual void setNotification( IONotifier * obj ); @@ -372,6 +388,84 @@ class IOUserNotification : public OSIterator virtual bool isValid() APPLE_KEXT_OVERRIDE; }; +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +OSDefineMetaClassAndStructors( IOUserIterator, OSIterator ) + +IOUserIterator * +IOUserIterator::withIterator(OSIterator * iter) +{ + IOUserIterator * me; + + if (!iter) return (0); + + me = new IOUserIterator; + if (me && !me->init()) + { + me->release(); + me = 0; + } + if (!me) return me; + me->userIteratorObject = iter; + + return (me); +} + +bool +IOUserIterator::init( void ) +{ + if (!OSObject::init()) return (false); + + lock = IOLockAlloc(); + if( !lock) + return( false ); + + return (true); +} + +void +IOUserIterator::free() +{ + if (userIteratorObject) userIteratorObject->release(); + if (lock) IOLockFree(lock); + OSObject::free(); +} + +void +IOUserIterator::reset() +{ + IOLockLock(lock); + assert(OSDynamicCast(OSIterator, userIteratorObject)); + ((OSIterator *)userIteratorObject)->reset(); + IOLockUnlock(lock); +} + +bool +IOUserIterator::isValid() +{ + bool ret; + + IOLockLock(lock); + assert(OSDynamicCast(OSIterator, userIteratorObject)); + ret = ((OSIterator *)userIteratorObject)->isValid(); + IOLockUnlock(lock); + + return (ret); +} + +OSObject * +IOUserIterator::getNextObject() +{ + OSObject * ret; + + IOLockLock(lock); + assert(OSDynamicCast(OSIterator, userIteratorObject)); + ret = ((OSIterator *)userIteratorObject)->getNextObject(); + IOLockUnlock(lock); + + return (ret); +} + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ extern "C" { @@ -513,40 +607,29 @@ class IOServiceMessageUserNotification 
: public IOUserNotification /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #undef super -#define super OSIterator -OSDefineMetaClass( IOUserNotification, OSIterator ) -OSDefineAbstractStructors( IOUserNotification, OSIterator ) +#define super IOUserIterator +OSDefineMetaClass( IOUserNotification, IOUserIterator ) +OSDefineAbstractStructors( IOUserNotification, IOUserIterator ) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ -bool IOUserNotification::init( void ) -{ - if( !super::init()) - return( false ); - - lock = IOLockAlloc(); - if( !lock) - return( false ); - - return( true ); -} - void IOUserNotification::free( void ) { - if( holdNotify) - holdNotify->remove(); + if (holdNotify) + { + assert(OSDynamicCast(IONotifier, holdNotify)); + ((IONotifier *)holdNotify)->remove(); + holdNotify = 0; + } // can't be in handler now - if( lock) - IOLockFree( lock ); - super::free(); } void IOUserNotification::setNotification( IONotifier * notify ) { - IONotifier * previousNotify; + OSObject * previousNotify; IOLockLock( gIOObjectPortLock); @@ -556,7 +639,10 @@ void IOUserNotification::setNotification( IONotifier * notify ) IOLockUnlock( gIOObjectPortLock); if( previousNotify) - previousNotify->remove(); + { + assert(OSDynamicCast(IONotifier, previousNotify)); + ((IONotifier *)previousNotify)->remove(); + } } void IOUserNotification::reset() @@ -1503,6 +1589,14 @@ extern "C" { if( !(out = OSDynamicCast( cls, obj))) \ return( kIOReturnBadArgument ) +#define CHECKLOCKED(cls,obj,out) \ + IOUserIterator * oIter; \ + cls * out; \ + if( !(oIter = OSDynamicCast(IOUserIterator, obj))) \ + return (kIOReturnBadArgument); \ + if( !(out = OSDynamicCast(cls, oIter->userIteratorObject))) \ + return (kIOReturnBadArgument) + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // Create a vm_map_copy_t or kalloc'ed data for memory @@ -1671,6 +1765,7 @@ kern_return_t is_io_iterator_next( io_object_t iterator, 
io_object_t *object ) { + IOReturn ret; OSObject * obj; CHECK( OSIterator, iterator, iter ); @@ -1679,9 +1774,11 @@ kern_return_t is_io_iterator_next( if( obj) { obj->retain(); *object = obj; - return( kIOReturnSuccess ); + ret = kIOReturnSuccess; } else - return( kIOReturnNoDevice ); + ret = kIOReturnNoDevice; + + return (ret); } /* Routine io_iterator_reset */ @@ -1723,6 +1820,7 @@ static kern_return_t internal_io_service_match_property_table( obj = matching_size ? OSUnserializeXML(matching, matching_size) : OSUnserializeXML(matching); if( (dict = OSDynamicCast( OSDictionary, obj))) { + *matches = service->passiveMatch( dict ); kr = kIOReturnSuccess; } else @@ -1795,7 +1893,7 @@ static kern_return_t internal_io_service_get_matching_services( obj = matching_size ? OSUnserializeXML(matching, matching_size) : OSUnserializeXML(matching); if( (dict = OSDynamicCast( OSDictionary, obj))) { - *existing = IOService::getMatchingServices( dict ); + *existing = IOUserIterator::withIterator(IOService::getMatchingServices( dict )); kr = kIOReturnSuccess; } else kr = kIOReturnBadArgument; @@ -2277,8 +2375,9 @@ kern_return_t is_io_registry_create_iterator( if( master_port != master_device_port) return( kIOReturnNotPrivileged); - *iterator = IORegistryIterator::iterateOver( - IORegistryEntry::getPlane( plane ), options ); + *iterator = IOUserIterator::withIterator( + IORegistryIterator::iterateOver( + IORegistryEntry::getPlane( plane ), options )); return( *iterator ? kIOReturnSuccess : kIOReturnBadArgument ); } @@ -2292,8 +2391,9 @@ kern_return_t is_io_registry_entry_create_iterator( { CHECK( IORegistryEntry, registry_entry, entry ); - *iterator = IORegistryIterator::iterateOver( entry, - IORegistryEntry::getPlane( plane ), options ); + *iterator = IOUserIterator::withIterator( + IORegistryIterator::iterateOver( entry, + IORegistryEntry::getPlane( plane ), options )); return( *iterator ? 
kIOReturnSuccess : kIOReturnBadArgument ); } @@ -2302,9 +2402,11 @@ kern_return_t is_io_registry_entry_create_iterator( kern_return_t is_io_registry_iterator_enter_entry( io_object_t iterator ) { - CHECK( IORegistryIterator, iterator, iter ); + CHECKLOCKED( IORegistryIterator, iterator, iter ); + IOLockLock(oIter->lock); iter->enterEntry(); + IOLockUnlock(oIter->lock); return( kIOReturnSuccess ); } @@ -2315,9 +2417,11 @@ kern_return_t is_io_registry_iterator_exit_entry( { bool didIt; - CHECK( IORegistryIterator, iterator, iter ); + CHECKLOCKED( IORegistryIterator, iterator, iter ); + IOLockLock(oIter->lock); didIt = iter->exitEntry(); + IOLockUnlock(oIter->lock); return( didIt ? kIOReturnSuccess : kIOReturnNoDevice ); } diff --git a/libkern/libkern/Makefile b/libkern/libkern/Makefile index ee046b925..fb623933c 100644 --- a/libkern/libkern/Makefile +++ b/libkern/libkern/Makefile @@ -75,6 +75,12 @@ EXPORT_MI_LIST = \ kxld_types.h \ stack_protector.h +INSTALL_KF_MI_LCL_LIST += \ + section_keywords.h + +EXPORT_MI_LIST += \ + section_keywords.h + EXPORT_MI_GEN_LIST = version.h EXPORT_MI_DIR = libkern diff --git a/libkern/libkern/section_keywords.h b/libkern/libkern/section_keywords.h new file mode 100644 index 000000000..511d9db46 --- /dev/null +++ b/libkern/libkern/section_keywords.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _SECTION_KEYWORDS_H +#define _SECTION_KEYWORDS_H + + +/* Default behaviour */ +#ifndef SECURITY_READ_ONLY_EARLY +#define __PLACE_IN_SECTION(__segment__section) \ + __attribute__((used, section(__segment__section))) + +#define SECURITY_READ_ONLY_SPECIAL_SECTION(_t,__segment__section) \ + const _t __PLACE_IN_SECTION(__segment__section) + +#define SECURITY_READ_ONLY_EARLY(_t) const _t + +#define SECURITY_READ_ONLY_LATE(_t) _t + +#define SECURITY_READ_WRITE(_t) _t __attribute__((used)) +#endif /* SECURITY_READ_ONLY_EARLY */ + + +#endif /* _SECTION_KEYWORDS_H_ */ diff --git a/libkern/zlib/zutil.c b/libkern/zlib/zutil.c index 020291bc1..f90ac37a7 100644 --- a/libkern/zlib/zutil.c +++ b/libkern/zlib/zutil.c @@ -331,8 +331,15 @@ voidpf zcalloc (opaque, items, size) unsigned size; { if (opaque) items += size - size; /* make compiler happy */ - return sizeof(uInt) > 2 ? 
(voidpf)malloc(items * size) : - (voidpf)calloc(items, size); + if (sizeof(uInt) > 2) { + /* + to prevent use of uninitialized memory, malloc and bzero (a failed malloc is returned as NULL, untouched) + */ + voidpf p = malloc(items * size); + if (p != Z_NULL) bzero(p, items * size); + return p; + } else + return (voidpf)calloc(items, size); } void zcfree (opaque, ptr) diff --git a/libsyscall/Libsyscall.xcodeproj/project.pbxproj b/libsyscall/Libsyscall.xcodeproj/project.pbxproj index 824393376..10371199f 100644 --- a/libsyscall/Libsyscall.xcodeproj/project.pbxproj +++ b/libsyscall/Libsyscall.xcodeproj/project.pbxproj @@ -114,8 +114,13 @@ 74119F46188F3B6A00C6F48F /* vm_page_size.h in Headers */ = {isa = PBXBuildFile; fileRef = 7466C923170CB99B004557CC /* vm_page_size.h */; }; 7466C924170CBA53004557CC /* vm_page_size.h in Headers */ = {isa = PBXBuildFile; fileRef = 7466C923170CB99B004557CC /* vm_page_size.h */; }; 74F3290B18EB269400B2B70E /* vm_page_size.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = 7466C923170CB99B004557CC /* vm_page_size.h */; }; + 978228281B8678DC008385AC /* pselect-darwinext.c in Sources */ = {isa = PBXBuildFile; fileRef = 978228271B8678CB008385AC /* pselect-darwinext.c */; }; + 978228291B8678DF008385AC /* pselect-darwinext-cancel.c in Sources */ = {isa = PBXBuildFile; fileRef = 978228261B8678C2008385AC /* pselect-darwinext-cancel.c */; }; 7AE28FDF18AC41B1006A5626 /* csr.c in Sources */ = {isa = PBXBuildFile; fileRef = 7AE28FDE18AC41B1006A5626 /* csr.c */; }; 9002401118FC9A7F00D73BFA /* rename_ext.c in Sources */ = {isa = PBXBuildFile; fileRef = 906AA2D018F74CD1001C681A /* rename_ext.c */; }; + 928336A11B83ED9100873B90 /* thread_register_state.c in Sources */ = {isa = PBXBuildFile; fileRef = 928336A01B83ED7800873B90 /* thread_register_state.c */; }; + 9299E14A1B841E74005B7350 /* thread_state.h in Headers */ = {isa = PBXBuildFile; fileRef = 928336A21B8412C100873B90 /* thread_state.h */; }; + 9299E14B1B841F59005B7350 /* thread_state.h in Headers */ = {isa = PBXBuildFile; fileRef = 928336A21B8412C100873B90 /*
thread_state.h */; }; A59CB95616669EFB00B064B3 /* stack_logging_internal.h in Headers */ = {isa = PBXBuildFile; fileRef = A59CB95516669DB700B064B3 /* stack_logging_internal.h */; }; A59CB9581666A1A200B064B3 /* munmap.c in Sources */ = {isa = PBXBuildFile; fileRef = A59CB9571666A1A200B064B3 /* munmap.c */; }; BA0D9FB1199031AD007E8A73 /* kdebug_trace.c in Sources */ = {isa = PBXBuildFile; fileRef = BA0D9FB0199031AD007E8A73 /* kdebug_trace.c */; }; @@ -228,7 +233,7 @@ E4D45C3F16FB20D30002AF25 /* spawn.h in Headers */ = {isa = PBXBuildFile; fileRef = E4D45C3D16FB20970002AF25 /* spawn.h */; settings = {ATTRIBUTES = (Public, ); }; }; E4D45C4016FB20DC0002AF25 /* spawn_private.h in Headers */ = {isa = PBXBuildFile; fileRef = E4D45C3E16FB20970002AF25 /* spawn_private.h */; settings = {ATTRIBUTES = (Private, ); }; }; E4D7E55C16F8776300F92D8D /* index.c in Sources */ = {isa = PBXBuildFile; fileRef = E4D7E55316F8776300F92D8D /* index.c */; }; - E4D7E55E16F8776300F92D8D /* memset.c in Sources */ = {isa = PBXBuildFile; fileRef = E4D7E55516F8776300F92D8D /* memset.c */; }; + E4D7E55E16F8776300F92D8D /* memset.c in Sources */ = {isa = PBXBuildFile; fileRef = E4D7E55516F8776300F92D8D /* memset.c */; settings = {COMPILER_FLAGS = "-fno-builtin"; }; }; E4D7E55F16F8776300F92D8D /* strcmp.c in Sources */ = {isa = PBXBuildFile; fileRef = E4D7E55616F8776300F92D8D /* strcmp.c */; }; E4D7E56016F8776300F92D8D /* strcpy.c in Sources */ = {isa = PBXBuildFile; fileRef = E4D7E55716F8776300F92D8D /* strcpy.c */; }; E4D7E56116F8776300F92D8D /* strlcpy.c in Sources */ = {isa = PBXBuildFile; fileRef = E4D7E55916F8776300F92D8D /* strlcpy.c */; }; @@ -436,8 +441,12 @@ 4BDD5F1C1891AB2F004BF300 /* mach_approximate_time.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mach_approximate_time.s; sourceTree = ""; }; 72B1E6EC190723DB00FB3FA2 /* guarded_open_dprotected_np.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = 
guarded_open_dprotected_np.c; sourceTree = ""; }; 7466C923170CB99B004557CC /* vm_page_size.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = vm_page_size.h; sourceTree = ""; }; + 978228261B8678C2008385AC /* pselect-darwinext-cancel.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "pselect-darwinext-cancel.c"; sourceTree = ""; }; + 978228271B8678CB008385AC /* pselect-darwinext.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "pselect-darwinext.c"; sourceTree = ""; }; 7AE28FDE18AC41B1006A5626 /* csr.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = csr.c; sourceTree = ""; }; 906AA2D018F74CD1001C681A /* rename_ext.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = rename_ext.c; sourceTree = ""; }; + 928336A01B83ED7800873B90 /* thread_register_state.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = thread_register_state.c; sourceTree = ""; }; + 928336A21B8412C100873B90 /* thread_state.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = thread_state.h; sourceTree = ""; }; A59CB95516669DB700B064B3 /* stack_logging_internal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = stack_logging_internal.h; sourceTree = ""; }; A59CB9571666A1A200B064B3 /* munmap.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = munmap.c; sourceTree = ""; }; BA0D9FB0199031AD007E8A73 /* kdebug_trace.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = kdebug_trace.c; sourceTree = ""; }; @@ -728,6 +737,7 @@ C6460B7B182025DF00F73CCA /* sfi.c */, 24B223B3121DFF12007DAEDE /* sigsuspend-base.c */, 13B598931A142F5900DB2D5A /* stackshot.c */, + 928336A01B83ED7800873B90 /* thread_register_state.c */, 
248AA962122C7B2A0085F5B1 /* unlink.c */, 29A59AE5183B110C00E8B896 /* unlinkat.c */, 374A36E214748EE400AAF39D /* varargs_wrappers.s */, @@ -740,6 +750,8 @@ 248BA04A121C8EE4008C073F /* cancelable */ = { isa = PBXGroup; children = ( + 978228271B8678CB008385AC /* pselect-darwinext.c */, + 978228261B8678C2008385AC /* pselect-darwinext-cancel.c */, 248BA04B121C8EE4008C073F /* fcntl-base.c */, 248BA04E121C8F06008C073F /* fcntl.c */, 248BA051121C8FE2008C073F /* fcntl-cancel.c */, @@ -910,6 +922,7 @@ C9D9BCDD114B00600000D8B9 /* mach_interface.h */, C9D9BCDF114B00600000D8B9 /* port_obj.h */, C9D9BCE0114B00600000D8B9 /* sync.h */, + 928336A21B8412C100873B90 /* thread_state.h */, C9D9BCE3114B00600000D8B9 /* vm_task.h */, 7466C923170CB99B004557CC /* vm_page_size.h */, ); @@ -979,6 +992,7 @@ C6C401241741566D000AE69F /* gethostuuid_private.h in Headers */, C6D3EFB916542C510052CF30 /* mach.h in Headers */, C6D3EFBA16542C510052CF30 /* mach_error.h in Headers */, + 9299E14B1B841F59005B7350 /* thread_state.h in Headers */, C6D3EFBB16542C510052CF30 /* mach_init.h in Headers */, C6D3EFBC16542C510052CF30 /* mach_interface.h in Headers */, C6D3EFBD16542C510052CF30 /* port_obj.h in Headers */, @@ -1012,6 +1026,7 @@ C9D9BD26114B00600000D8B9 /* mach.h in Headers */, C9D9BD27114B00600000D8B9 /* mach_error.h in Headers */, C9D9BD28114B00600000D8B9 /* mach_init.h in Headers */, + 9299E14A1B841E74005B7350 /* thread_state.h in Headers */, C6C40122174155E3000AE69F /* gethostuuid_private.h in Headers */, C9D9BD29114B00600000D8B9 /* mach_interface.h in Headers */, C9D9BD2B114B00600000D8B9 /* port_obj.h in Headers */, @@ -1251,6 +1266,7 @@ C9D9BD56114B00600000D8B9 /* slot_name.c in Sources */, 24484A7511F6178E00E10CD2 /* string.c in Sources */, E453AF351700FD3C00F2C94C /* getiopolicy_np.c in Sources */, + 978228281B8678DC008385AC /* pselect-darwinext.c in Sources */, 2485235511582D8F0051B413 /* mach_legacy.c in Sources */, 242AB66611EBDC1200107336 /* errno.c in Sources */, E4D45C2E16F868ED0002AF25 
/* libproc.c in Sources */, @@ -1271,6 +1287,7 @@ 24A7C5C711FF8DA6007669EB /* sendto.c in Sources */, 24A7C5C811FF8DA6007669EB /* setattrlist.c in Sources */, 24A7C5C911FF8DA6007669EB /* socketpair.c in Sources */, + 928336A11B83ED9100873B90 /* thread_register_state.c in Sources */, 9002401118FC9A7F00D73BFA /* rename_ext.c in Sources */, 2419382B12135FF6003CDE41 /* chmod.c in Sources */, 248BA01D121C56BF008C073F /* connect.c in Sources */, @@ -1310,6 +1327,7 @@ 248AA967122C7CDA0085F5B1 /* rename.c in Sources */, 24B8C2621237F53900D36CC3 /* remove-counter.c in Sources */, C99A4F501305B2BD0054B7B7 /* __get_cpu_capabilities.s in Sources */, + 978228291B8678DF008385AC /* pselect-darwinext-cancel.c in Sources */, C99A4F531305B43F0054B7B7 /* init_cpu_capabilities.c in Sources */, 030B179B135377B400DAD1F0 /* open_dprotected_np.c in Sources */, E4D45C3116F868ED0002AF25 /* proc_listpidspath.c in Sources */, diff --git a/libsyscall/mach/err_kern.sub b/libsyscall/mach/err_kern.sub index bc059a5dd..0cfc2646a 100644 --- a/libsyscall/mach/err_kern.sub +++ b/libsyscall/mach/err_kern.sub @@ -85,6 +85,11 @@ static const char * const err_codes_kern[] = { "(os/kern) let orphan continue", /* 45 */ "(os/kern) service not supported", "(os/kern) remote node down", + "(os/kern) thread not waiting", + "(os/kern) operation timed out", + "(os/kern) code signing error", /* 50 */ + "(os/kern) policy is static", + "(os/kern) insufficient input buffer size", }; static const char * const err_codes_unix[] = { diff --git a/libsyscall/mach/mach/thread_state.h b/libsyscall/mach/mach/thread_state.h new file mode 100644 index 000000000..42abc4ec2 --- /dev/null +++ b/libsyscall/mach/mach/thread_state.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2015 Apple Computer, Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _MACH_THREAD_STATE_H_ +#define _MACH_THREAD_STATE_H_ + +#include +#include + +#ifndef KERNEL +/* + * Gets all register values in the target thread with pointer-like contents. + * There's no guarantee that the returned values are valid pointers, but all + * valid pointers will be returned. The order and count of the provided + * register values is unspecified and may change; registers with values that + * are not valid pointers may be omitted, so the number of pointers returned + * may vary from call to call. 
+ * + * sp is an out parameter that will contain the stack pointer + * length is an in/out parameter for the length of the values array + * values is an array of pointers + * + * This may only be called on threads in the current task. If the current + * platform defines a stack red zone, the stack pointer returned will be + * adjusted to account for red zone. + * + * If length is insufficient, KERN_INSUFFICIENT_BUFFER_SIZE will be returned and + * length set to the amount of memory required. Callers MUST NOT assume that + * any particular size of buffer will be sufficient and should retry with an + * appropriately sized buffer upon this error. + */ +__OSX_UNAVAILABLE +__IOS_UNAVAILABLE +__TVOS_AVAILABLE(9.0) +__WATCHOS_UNAVAILABLE +kern_return_t thread_get_register_pointer_values(thread_t thread, uintptr_t *sp, size_t *length, uintptr_t *values); +#endif + +#endif /* _MACH_THREAD_STATE_H_ */ diff --git a/libsyscall/mach/tvos_prohibited_mig.txt b/libsyscall/mach/tvos_prohibited_mig.txt new file mode 100644 index 000000000..5b432b264 --- /dev/null +++ b/libsyscall/mach/tvos_prohibited_mig.txt @@ -0,0 +1,47 @@ +__TVOS_PROHIBITED +thread_terminate +act_get_state +act_set_state +thread_depress_abort +thread_get_special_port +thread_set_special_port +thread_set_exception_ports +thread_get_exception_ports +thread_swap_exception_ports +thread_get_mach_voucher +thread_set_mach_voucher +thread_swap_mach_voucher +mach_ports_register +mach_ports_lookup +task_suspend +task_resume +task_set_info +task_get_special_port +task_set_special_port +thread_create +thread_create_running +task_set_exception_ports +task_get_exception_ports +task_swap_exception_ports +task_policy_set +task_policy_get +task_zone_info +task_get_state +task_set_state +task_set_phys_footprint_limit +task_suspend2 +task_resume2 +task_get_mach_voucher +task_set_mach_voucher +task_swap_mach_voucher +task_set_port_space +host_request_notification +host_info +task_wire +mach_port_allocate_name
+host_create_mach_voucher +host_register_mach_voucher_attr_manager +host_register_well_known_mach_voucher_attr_manager +host_set_atm_diagnostic_flag +host_get_atm_diagnostic_flag + diff --git a/libsyscall/mach/watchos_prohibited_mig.txt b/libsyscall/mach/watchos_prohibited_mig.txt index 4d27c6243..5201c0165 100644 --- a/libsyscall/mach/watchos_prohibited_mig.txt +++ b/libsyscall/mach/watchos_prohibited_mig.txt @@ -1,4 +1,4 @@ -__WATCHOS_PROHIBITED __TVOS_PROHIBITED +__WATCHOS_PROHIBITED thread_terminate act_get_state act_set_state diff --git a/libsyscall/wrappers/cancelable/pselect-darwinext-cancel.c b/libsyscall/wrappers/cancelable/pselect-darwinext-cancel.c new file mode 100644 index 000000000..54ea91375 --- /dev/null +++ b/libsyscall/wrappers/cancelable/pselect-darwinext-cancel.c @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#define VARIANT_CANCELABLE +#define VARIANT_DARWIN_EXTSN + +#include "../select-base.c" diff --git a/libsyscall/wrappers/cancelable/pselect-darwinext.c b/libsyscall/wrappers/cancelable/pselect-darwinext.c new file mode 100644 index 000000000..4bfb1b756 --- /dev/null +++ b/libsyscall/wrappers/cancelable/pselect-darwinext.c @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#undef __DARWIN_NON_CANCELABLE +#define __DARWIN_NON_CANCELABLE 1 +#define VARIANT_DARWIN_EXTSN + +#include "../select-base.c" diff --git a/libsyscall/wrappers/select-base.c b/libsyscall/wrappers/select-base.c index 09f8816f6..f688d6f36 100644 --- a/libsyscall/wrappers/select-base.c +++ b/libsyscall/wrappers/select-base.c @@ -26,24 +26,37 @@ #define __DARWIN_NON_CANCELABLE 0 #endif /* __LP64__ && (VARIANT_CANCELABLE || VARIANT_PRE1050) */ +#if defined(VARIANT_DARWIN_EXTSN) +#define _DARWIN_C_SOURCE +#define _DARWIN_UNLIMITED_SELECT +#endif + #include +#include +#include #include "_errno.h" #if defined(VARIANT_CANCELABLE) || defined(VARIANT_PRE1050) +#if !defined(VARIANT_DARWIN_EXTSN) extern int __select(int, fd_set * __restrict, fd_set * __restrict, fd_set * __restrict, struct timeval * __restrict); +#endif +int __pselect(int, fd_set * __restrict, fd_set * __restrict, + fd_set * __restrict, const struct timespec * __restrict, const sigset_t * __restrict); #else /* !VARIANT_CANCELABLE && !VARIANT_PRE1050 */ +#if !defined(VARIANT_DARWIN_EXTSN) int __select_nocancel(int, fd_set * __restrict, fd_set * __restrict, fd_set * __restrict, struct timeval * __restrict); +#endif +int __pselect_nocancel(int, fd_set * __restrict, fd_set * __restrict, + fd_set * __restrict, const struct timespec * __restrict, const sigset_t * __restrict); #endif /* VARIANT_CANCELABLE || VARIANT_PRE1050 */ +#if !defined(VARIANT_DARWIN_EXTSN) /* - * select stub, return error if nfds > FD_SETSIZE - * add pthread cancelability - * mandated for conformance. - * - * This is only for (non DARWINEXTSN) UNIX03 (both cancelable and - * non-cancelable) and for legacy + * select() implementation for 1050 and legacy (cancelable and non-cancelable) + * variants. The darwin extension variants (both cancelable & non-cancelable) are + * mapped directly to the syscall stub. 
*/ int select(int nfds, fd_set * __restrict readfds, fd_set * __restrict writefds, @@ -55,7 +68,6 @@ select(int nfds, fd_set * __restrict readfds, fd_set * __restrict writefds, #endif /* VARIANT_LEGACY || VARIANT_PRE1050 */ ) { - #if defined(VARIANT_LEGACY) || defined(VARIANT_PRE1050) struct timeval tb, *timeout; @@ -66,17 +78,111 @@ select(int nfds, fd_set * __restrict readfds, fd_set * __restrict writefds, tb.tv_sec = 0; tb.tv_usec = 10000; timeout = &tb; - } else + } else { timeout = intimeout; + } #else /* !VARIANT_LEGACY && !VARIANT_PRE1050 */ if (nfds > FD_SETSIZE) { errno = EINVAL; return -1; } -#endif /* VARIANT_LEGACY || VARIANT_PRE1050 */ +#endif + #if defined(VARIANT_CANCELABLE) || defined(VARIANT_PRE1050) return __select(nfds, readfds, writefds, exceptfds, timeout); #else /* !VARIANT_CANCELABLE && !VARIANT_PRE1050 */ return __select_nocancel(nfds, readfds, writefds, exceptfds, timeout); #endif /* VARIANT_CANCELABLE || VARIANT_PRE1050 */ } +#endif /* !defined(VARIANT_DARWIN_EXTSN) */ + + +/* + * User-space emulation of pselect() syscall for B&I + * TODO: remove when B&I move to xnu with native pselect() + */ +extern int __pthread_sigmask(int, const sigset_t *, sigset_t *); +static int +_pselect_emulated(int count, fd_set * __restrict rfds, fd_set * __restrict wfds, + fd_set * __restrict efds, const struct timespec * __restrict timo, + const sigset_t * __restrict mask) +{ + sigset_t omask; + struct timeval tvtimo, *tvp; + int rv, sverrno; + + if (timo) { + tvtimo.tv_sec = timo->tv_sec; + tvtimo.tv_usec = (__darwin_suseconds_t)(timo->tv_nsec / 1000); + tvp = &tvtimo; + } else { + tvp = 0; + } + + if (mask != 0) { + rv = __pthread_sigmask(SIG_SETMASK, mask, &omask); + if (rv != 0) + return rv; + } + + rv = select(count, rfds, wfds, efds, tvp); + if (mask != 0) { + sverrno = errno; + __pthread_sigmask(SIG_SETMASK, &omask, (sigset_t *)0); + errno = sverrno; + } + + return rv; +} + +/* + * pselect() implementation for all variants. 
Unlike select(), we implement the + * darwin extension variants here to catch cases where xnu doesn't implement + * pselect and we need to emulate. + */ +int +pselect(int nfds, fd_set * __restrict readfds, fd_set * __restrict writefds, + fd_set * __restrict exceptfds, const struct timespec * __restrict +#if defined(VARIANT_LEGACY) || defined(VARIANT_PRE1050) + intimeout, +#else /* !VARIANT_LEGACY && !VARIANT_PRE1050 */ + timeout, +#endif /* VARIANT_LEGACY || VARIANT_PRE1050 */ + const sigset_t * __restrict sigmask) +{ + int ret; +#if defined(VARIANT_LEGACY) || defined(VARIANT_PRE1050) + struct timespec tb; + const struct timespec *timeout; + + /* + * Legacy behavior: a non-zero sub-second timeout (tv_sec == 0, tv_nsec > 0) shorter than 10 msec is raised to 10 msec + */ + if (intimeout && intimeout->tv_sec == 0 && intimeout->tv_nsec > 0 && intimeout->tv_nsec < 10000000L) { + tb.tv_sec = 0; + tb.tv_nsec = 10000000L; + timeout = &tb; + } else { + timeout = intimeout; + } +#elif defined(VARIANT_DARWIN_EXTSN) +#else + /* 1050 variant */ + if (nfds > FD_SETSIZE) { + errno = EINVAL; + return -1; + } +#endif + +#if defined(VARIANT_CANCELABLE) || defined(VARIANT_PRE1050) + ret = __pselect(nfds, readfds, writefds, exceptfds, timeout, sigmask); +#else /* !VARIANT_CANCELABLE && !VARIANT_PRE1050 */ + ret = __pselect_nocancel(nfds, readfds, writefds, exceptfds, timeout, sigmask); +#endif /* VARIANT_CANCELABLE || VARIANT_PRE1050 */ + + if (ret == -1 && errno == ENOSYS) { + ret = _pselect_emulated(nfds, readfds, writefds, exceptfds, timeout, sigmask); + } + + return ret; +} diff --git a/libsyscall/wrappers/string/memset.c b/libsyscall/wrappers/string/memset.c index 3ae3c7717..cab6587d6 100644 --- a/libsyscall/wrappers/string/memset.c +++ b/libsyscall/wrappers/string/memset.c @@ -36,6 +36,8 @@ #define wsize sizeof(u_int) #define wmask (wsize - 1) +// n.b. this must be compiled with -fno-builtin or it might get optimized into +// a recursive call to bzero.
__attribute__((visibility("hidden"))) void bzero(void *dst0, size_t length) diff --git a/libsyscall/wrappers/thread_register_state.c b/libsyscall/wrappers/thread_register_state.c new file mode 100644 index 000000000..2fa478328 --- /dev/null +++ b/libsyscall/wrappers/thread_register_state.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License.
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include + +kern_return_t +thread_get_register_pointer_values(thread_t thread, uintptr_t *sp, size_t *length, uintptr_t *values) +{ + if (!length) return KERN_INVALID_ARGUMENT; + if (*length > 0 && values == NULL) return KERN_INVALID_ARGUMENT; + + size_t in_length = *length; + size_t out_length = 0; + +#if defined(__i386__) + i386_thread_state_t state = {}; + thread_state_flavor_t flavor = x86_THREAD_STATE32; + mach_msg_type_number_t count = i386_THREAD_STATE_COUNT; +#elif defined(__x86_64__) + x86_thread_state64_t state = {}; + thread_state_flavor_t flavor = x86_THREAD_STATE64; + mach_msg_type_number_t count = x86_THREAD_STATE64_COUNT; +#else +#error thread_get_register_pointer_values not defined for this architecture +#endif + + kern_return_t ret = thread_get_state(thread, flavor, (thread_state_t)&state, &count); + if (ret != KERN_SUCCESS){ + return ret; + } + + // If the provided pointer value is > PAGE_SIZE, add it to the output array + // if there's available space. (Values between 0 and PAGE_SIZE are the NULL page + // and not valid pointers.)
+#define push_register_value(p) do { \ + if ((uintptr_t)p > PAGE_SIZE) { \ + if (out_length < in_length && values) \ + values[out_length] = p; \ + out_length++; \ + } } while (0) + +#if defined(__i386__) + if (sp) *sp = state.__esp; + + push_register_value(state.__eip); + + push_register_value(state.__eax); + push_register_value(state.__ebx); + push_register_value(state.__ecx); + push_register_value(state.__edx); + push_register_value(state.__edi); + push_register_value(state.__esi); + push_register_value(state.__ebp); +#elif defined(__x86_64__) + if (sp) *sp = state.__rsp - 128 /* redzone */; + + push_register_value(state.__rip); + + push_register_value(state.__rax); + push_register_value(state.__rbx); + push_register_value(state.__rcx); + push_register_value(state.__rdx); + push_register_value(state.__rdi); + push_register_value(state.__rsi); + push_register_value(state.__rbp); + push_register_value(state.__r8); + push_register_value(state.__r9); + push_register_value(state.__r10); + push_register_value(state.__r11); + push_register_value(state.__r12); + push_register_value(state.__r13); + push_register_value(state.__r14); + push_register_value(state.__r15); +#else +#error thread_get_register_pointer_values not defined for this architecture +#endif + + *length = out_length; + + if (in_length == 0 || out_length > in_length){ + return KERN_INSUFFICIENT_BUFFER_SIZE; + } + + return KERN_SUCCESS; +} diff --git a/libsyscall/xcodescripts/mach_install_mig.sh b/libsyscall/xcodescripts/mach_install_mig.sh index 9364707d9..f90bab981 100755 --- a/libsyscall/xcodescripts/mach_install_mig.sh +++ b/libsyscall/xcodescripts/mach_install_mig.sh @@ -77,7 +77,7 @@ MIGS_PRIVATE="" MIGS_DUAL_PUBLIC_PRIVATE="" -if [[ "$PLATFORM_NAME" = "iphoneos" || "$PLATFORM_NAME" = "iphonesimulator" || "$PLATFORM_NAME" = "iphoneosnano" || "$PLATFORM_NAME" = "iphonenanosimulator" || "$PLATFORM_NAME" = "tvos" || "$PLATFOM_NAME" = "tvsimulator" || "$PLATFOM_NAME" = "appletvos" || "$PLATFOM_NAME" = "appletvsimulator" || "$PLATFOM_NAME" =
"watchos" || "$PLATFOM_NAME" = "watchsimulator" ]] +if ( echo {iphone,tv,appletv,watch}{os,simulator} iphone{osnano,nanosimulator} | grep -wFq "$PLATFORM_NAME" ) then MIGS_PRIVATE="mach_vm.defs" else @@ -101,9 +101,10 @@ MACH_HDRS="mach.h port_obj.h sync.h vm_task.h - vm_page_size.h" + vm_page_size.h + thread_state.h" -MIG_FILTERS="watchos_prohibited_mig.txt" +MIG_FILTERS="watchos_prohibited_mig.txt tvos_prohibited_mig.txt" # install /usr/include/server headers mkdir -p $SERVER_HEADER_DST diff --git a/osfmk/i386/AT386/model_dep.c b/osfmk/i386/AT386/model_dep.c index a697ebe0f..d4a138af8 100644 --- a/osfmk/i386/AT386/model_dep.c +++ b/osfmk/i386/AT386/model_dep.c @@ -270,7 +270,9 @@ machine_startup(void) if (PE_parse_boot_argn("debug", &debug_boot_arg, sizeof (debug_boot_arg))) { panicDebugging = TRUE; +#if DEVELOPMENT || DEBUG if (debug_boot_arg & DB_HALT) halt_in_debugger=1; +#endif if (debug_boot_arg & DB_PRT) disable_debug_output=FALSE; if (debug_boot_arg & DB_SLOG) systemLogDiags=TRUE; if (debug_boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE; diff --git a/osfmk/i386/i386_init.c b/osfmk/i386/i386_init.c index 844b54750..dc528cce1 100644 --- a/osfmk/i386/i386_init.c +++ b/osfmk/i386/i386_init.c @@ -415,7 +415,6 @@ i386_init(void) unsigned int cpus = 0; boolean_t fidn; boolean_t IA32e = TRUE; - char namep[16]; postcode(I386_INIT_ENTRY); @@ -443,9 +442,6 @@ i386_init(void) kernel_debug_string_simple("PE_init_kprintf"); PE_init_kprintf(FALSE); - if(PE_parse_boot_argn("-show_pointers", &namep, sizeof (namep))) - doprnt_hide_pointers = FALSE; - kernel_debug_string_simple("kernel_early_bootstrap"); kernel_early_bootstrap(); diff --git a/osfmk/i386/mp.c b/osfmk/i386/mp.c index ae4a98a94..8d13b3645 100644 --- a/osfmk/i386/mp.c +++ b/osfmk/i386/mp.c @@ -1173,7 +1173,8 @@ mp_cpus_call_action(void) mp_call_head_unlock(cqp, intrs_enabled); KERNEL_DEBUG_CONSTANT( TRACE_MP_CPUS_CALL_ACTION, - call.func, call.arg0, call.arg1, call.maskp, 0); + 
VM_KERNEL_UNSLIDE(call.func), VM_KERNEL_UNSLIDE_OR_PERM(call.arg0), + VM_KERNEL_UNSLIDE_OR_PERM(call.arg1), VM_KERNEL_ADDRPERM(call.maskp), 0); call.func(call.arg0, call.arg1); (void) mp_call_head_lock(cqp); } @@ -1265,7 +1266,7 @@ mp_cpus_call1( KERNEL_DEBUG_CONSTANT( TRACE_MP_CPUS_CALL | DBG_FUNC_START, - cpus, mode, VM_KERNEL_UNSLIDE(action_func), arg0, arg1); + cpus, mode, VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1)); if (!smp_initialized) { if ((cpus & CPUMASK_SELF) == 0) @@ -1309,7 +1310,7 @@ mp_cpus_call1( KERNEL_DEBUG_CONSTANT( TRACE_MP_CPUS_CALL_LOCAL, VM_KERNEL_UNSLIDE(action_func), - arg0, arg1, 0, 0); + VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0); action_func(arg0, arg1); } } else { @@ -1375,7 +1376,7 @@ mp_cpus_call1( if (mode != SYNC && call_self ) { KERNEL_DEBUG_CONSTANT( TRACE_MP_CPUS_CALL_LOCAL, - VM_KERNEL_UNSLIDE(action_func), arg0, arg1, 0, 0); + VM_KERNEL_UNSLIDE(action_func), VM_KERNEL_UNSLIDE_OR_PERM(arg0), VM_KERNEL_UNSLIDE_OR_PERM(arg1), 0, 0); if (action_func != NULL) { ml_set_interrupts_enabled(FALSE); action_func(arg0, arg1); diff --git a/osfmk/i386/pmap_x86_common.c b/osfmk/i386/pmap_x86_common.c index 9841a0754..c6352893a 100644 --- a/osfmk/i386/pmap_x86_common.c +++ b/osfmk/i386/pmap_x86_common.c @@ -1942,7 +1942,8 @@ pmap_change_wiring( PMAP_LOCK(map); if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL) - panic("pmap_change_wiring: pte missing"); + panic("pmap_change_wiring(%p,0x%llx,%d): pte missing", + map, vaddr, wired); if (wired && !iswired(*pte)) { /* @@ -2020,26 +2021,26 @@ pmap_map_bd( return(virt); } -unsigned int +mach_vm_size_t pmap_query_resident( pmap_t pmap, addr64_t s64, addr64_t e64, - unsigned int *compressed_count_p) + mach_vm_size_t *compressed_bytes_p) { pt_entry_t *pde; pt_entry_t *spte, *epte; addr64_t l64; uint64_t deadline; - unsigned int result; + mach_vm_size_t resident_bytes; + mach_vm_size_t compressed_bytes; boolean_t is_ept; 
- unsigned int compressed_count; pmap_intr_assert(); if (pmap == PMAP_NULL || pmap == kernel_pmap || s64 == e64) { - if (compressed_count_p) { - *compressed_count_p = 0; + if (compressed_bytes_p) { + *compressed_bytes_p = 0; } return 0; } @@ -2051,8 +2052,8 @@ pmap_query_resident( (uint32_t) (s64 >> 32), s64, (uint32_t) (e64 >> 32), e64); - result = 0; - compressed_count = 0; + resident_bytes = 0; + compressed_bytes = 0; PMAP_LOCK(pmap); @@ -2075,9 +2076,9 @@ pmap_query_resident( for (; spte < epte; spte++) { if (pte_to_pa(*spte) != 0) { - result++; + resident_bytes += PAGE_SIZE; } else if (*spte & PTE_COMPRESSED) { - compressed_count++; + compressed_bytes += PAGE_SIZE; } } @@ -2097,10 +2098,10 @@ pmap_query_resident( PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END, pmap, 0, 0, 0, 0); - if (compressed_count_p) { - *compressed_count_p = compressed_count; + if (compressed_bytes_p) { + *compressed_bytes_p = compressed_bytes; } - return result; + return resident_bytes; } #if MACH_ASSERT diff --git a/osfmk/i386/proc_reg.h b/osfmk/i386/proc_reg.h index b9fcb252b..dbf819f6c 100644 --- a/osfmk/i386/proc_reg.h +++ b/osfmk/i386/proc_reg.h @@ -603,9 +603,7 @@ __END_DECLS #define MSR_IA32_PP0_ENERGY_STATUS 0x639 #define MSR_IA32_PP1_ENERGY_STATUS 0x641 -#if !defined(XNU_HIDE_SKYLAKE) #define MSR_IA32_IA_PERF_LIMIT_REASONS_SKL 0x64F -#endif #define MSR_IA32_IA_PERF_LIMIT_REASONS 0x690 #define MSR_IA32_GT_PERF_LIMIT_REASONS 0x6B0 diff --git a/osfmk/ipc/ipc_importance.c b/osfmk/ipc/ipc_importance.c index e234c3a18..bd48b1da7 100644 --- a/osfmk/ipc/ipc_importance.c +++ b/osfmk/ipc/ipc_importance.c @@ -653,7 +653,7 @@ ipc_importance_task_propagate_helper( assert(IP_VALID(port)); ip_lock(port); temp_task_imp = IIT_NULL; - if (!ipc_port_importance_delta_internal(port, &delta, &temp_task_imp)) { + if (!ipc_port_importance_delta_internal(port, IPID_OPTION_NORMAL, &delta, &temp_task_imp)) { ip_unlock(port); } @@ -2045,6 +2045,276 @@ ipc_importance_disconnect_task(task_t task) 
task_deallocate(task); } +/* + * Routine: ipc_importance_check_circularity + * Purpose: + * Check if queueing "port" in a message for "dest" + * would create a circular group of ports and messages. + * + * If no circularity (FALSE returned), then "port" + * is changed from "in limbo" to "in transit". + * + * That is, we want to set port->ip_destination == dest, + * but guaranteeing that this doesn't create a circle + * port->ip_destination->ip_destination->... == port + * + * Additionally, if port was successfully changed to "in transit", + * propagate boost assertions from the "in limbo" port to all + * the ports in the chain, and, if the destination task accepts + * boosts, to the destination task. + * + * Conditions: + * No ports locked. References held for "port" and "dest". + */ + +boolean_t +ipc_importance_check_circularity( + ipc_port_t port, + ipc_port_t dest) +{ + ipc_importance_task_t imp_task = IIT_NULL; + ipc_importance_task_t release_imp_task = IIT_NULL; + boolean_t imp_lock_held = FALSE; + int assertcnt = 0; + ipc_port_t base; + + assert(port != IP_NULL); + assert(dest != IP_NULL); + + if (port == dest) + return TRUE; + base = dest; + + /* port is in limbo, so donation status is safe to latch */ + if (port->ip_impdonation != 0) { + imp_lock_held = TRUE; + ipc_importance_lock(); + } + + /* + * First try a quick check that can run in parallel. + * No circularity if dest is not in transit. + */ + ip_lock(port); + + /* + * Even if port is just carrying assertions for others, + * we need the importance lock. 
+ */ + if (port->ip_impcount > 0 && !imp_lock_held) { + if (!ipc_importance_lock_try()) { + ip_unlock(port); + ipc_importance_lock(); + ip_lock(port); + } + imp_lock_held = TRUE; + } + + if (ip_lock_try(dest)) { + if (!ip_active(dest) || + (dest->ip_receiver_name != MACH_PORT_NULL) || + (dest->ip_destination == IP_NULL)) + goto not_circular; + + /* dest is in transit; further checking necessary */ + + ip_unlock(dest); + } + ip_unlock(port); + + /* + * We're about to pay the cost to serialize, + * just go ahead and grab importance lock. + */ + if (!imp_lock_held) { + ipc_importance_lock(); + imp_lock_held = TRUE; + } + + ipc_port_multiple_lock(); /* massive serialization */ + + /* + * Search for the end of the chain (a port not in transit), + * acquiring locks along the way. + */ + + for (;;) { + ip_lock(base); + + if (!ip_active(base) || + (base->ip_receiver_name != MACH_PORT_NULL) || + (base->ip_destination == IP_NULL)) + break; + + base = base->ip_destination; + } + + /* all ports in chain from dest to base, inclusive, are locked */ + + if (port == base) { + /* circularity detected! */ + + ipc_port_multiple_unlock(); + + /* port (== base) is in limbo */ + + assert(ip_active(port)); + assert(port->ip_receiver_name == MACH_PORT_NULL); + assert(port->ip_destination == IP_NULL); + + while (dest != IP_NULL) { + ipc_port_t next; + + /* dest is in transit or in limbo */ + + assert(ip_active(dest)); + assert(dest->ip_receiver_name == MACH_PORT_NULL); + + next = dest->ip_destination; + ip_unlock(dest); + dest = next; + } + + if (imp_lock_held) + ipc_importance_unlock(); + + return TRUE; + } + + /* + * The guarantee: lock port while the entire chain is locked. + * Once port is locked, we can take a reference to dest, + * add port to the chain, and unlock everything. 
+ */ + + ip_lock(port); + ipc_port_multiple_unlock(); + + not_circular: + + /* port is in limbo */ + + assert(ip_active(port)); + assert(port->ip_receiver_name == MACH_PORT_NULL); + assert(port->ip_destination == IP_NULL); + + ip_reference(dest); + port->ip_destination = dest; + + /* must have been in limbo or still bound to a task */ + assert(port->ip_tempowner != 0); + + /* + * We delayed dropping assertions from a specific task. + * Cache that info now (we'll drop assertions and the + * task reference below). + */ + release_imp_task = port->ip_imp_task; + if (IIT_NULL != release_imp_task) { + port->ip_imp_task = IIT_NULL; + } + assertcnt = port->ip_impcount; + + /* take the port out of limbo w.r.t. assertions */ + port->ip_tempowner = 0; + + /* now unlock chain */ + + ip_unlock(port); + + for (;;) { + + /* every port along chain track assertions behind it */ + ipc_port_impcount_delta(dest, assertcnt, base); + + if (dest == base) + break; + + /* port is in transit */ + + assert(ip_active(dest)); + assert(dest->ip_receiver_name == MACH_PORT_NULL); + assert(dest->ip_destination != IP_NULL); + assert(dest->ip_tempowner == 0); + + port = dest->ip_destination; + ip_unlock(dest); + dest = port; + } + + /* base is not in transit */ + assert(!ip_active(base) || + (base->ip_receiver_name != MACH_PORT_NULL) || + (base->ip_destination == IP_NULL)); + + /* + * Find the task to boost (if any). + * We will boost "through" ports that don't know + * about inheritance to deliver receive rights that + * do. 
+ */ + if (ip_active(base) && (assertcnt > 0)) { + assert(imp_lock_held); + if (base->ip_tempowner != 0) { + if (IIT_NULL != base->ip_imp_task) { + /* specified tempowner task */ + imp_task = base->ip_imp_task; + assert(ipc_importance_task_is_any_receiver_type(imp_task)); + } + /* otherwise don't boost current task */ + + } else if (base->ip_receiver_name != MACH_PORT_NULL) { + ipc_space_t space = base->ip_receiver; + + /* only spaces with boost-accepting tasks */ + if (space->is_task != TASK_NULL && + ipc_importance_task_is_any_receiver_type(space->is_task->task_imp_base)) + imp_task = space->is_task->task_imp_base; + } + + /* take reference before unlocking base */ + if (imp_task != IIT_NULL) { + ipc_importance_task_reference(imp_task); + } + } + + ip_unlock(base); + + /* + * Transfer assertions now that the ports are unlocked. + * Avoid extra overhead if transferring to/from the same task. + * + * NOTE: If a transfer is occurring, the new assertions will + * be added to imp_task BEFORE the importance lock is unlocked. + * This is critical - to avoid decrements coming from the kmsgs + * beating the increment to the task. 
+ */ + boolean_t transfer_assertions = (imp_task != release_imp_task); + + if (imp_task != IIT_NULL) { + assert(imp_lock_held); + if (transfer_assertions) + ipc_importance_task_hold_internal_assertion_locked(imp_task, assertcnt); + } + + if (release_imp_task != IIT_NULL) { + assert(imp_lock_held); + if (transfer_assertions) + ipc_importance_task_drop_internal_assertion_locked(release_imp_task, assertcnt); + } + + if (imp_lock_held) + ipc_importance_unlock(); + + if (imp_task != IIT_NULL) + ipc_importance_task_release(imp_task); + + if (release_imp_task != IIT_NULL) + ipc_importance_task_release(release_imp_task); + + return FALSE; +} + /* * Routine: ipc_importance_send * Purpose: @@ -2067,7 +2337,6 @@ ipc_importance_send( ipc_importance_task_t task_imp; kern_return_t kr; - assert(IP_VALID(port)); /* If no donation to be made, return quickly */ @@ -2169,13 +2438,12 @@ ipc_importance_send( /* * If we need to relock the port, do it with the importance still locked. * This assures we get to add the importance boost through the port to - * the task BEFORE anyone else can attempt to undo that operation because + * the task BEFORE anyone else can attempt to undo that operation if * the sender lost donor status. 
*/ if (TRUE == port_lock_dropped) { ip_lock(port); } - ipc_importance_unlock(); portupdate: @@ -2190,11 +2458,36 @@ ipc_importance_send( } #endif /* IMPORTANCE_DEBUG */ - /* adjust port boost count (with port locked) */ - if (TRUE == ipc_port_importance_delta(port, 1)) { + mach_port_delta_t delta = 1; + boolean_t need_port_lock; + task_imp = IIT_NULL; + + /* adjust port boost count (with importance and port locked) */ + need_port_lock = ipc_port_importance_delta_internal(port, IPID_OPTION_NORMAL, &delta, &task_imp); + + /* if we need to adjust a task importance as a result, apply that here */ + if (IIT_NULL != task_imp && delta != 0) { + assert(delta == 1); + + /* if this results in a change of state, propagate the transistion */ + if (ipc_importance_task_check_transition(task_imp, IIT_UPDATE_HOLD, delta)) { + + /* can't hold the port lock during task transition(s) */ + if (!need_port_lock) { + need_port_lock = TRUE; + ip_unlock(port); + } + ipc_importance_task_propagate_assertion_locked(task_imp, IIT_UPDATE_HOLD, TRUE); + } + } + + ipc_importance_unlock(); + + if (need_port_lock) { port_lock_dropped = TRUE; ip_lock(port); } + return port_lock_dropped; } @@ -2449,7 +2742,12 @@ ipc_importance_inherit_from(ipc_kmsg_t kmsg) ipc_importance_unlock(); } - /* decrement port boost count */ + /* + * decrement port boost count + * This is OK to do without the importance lock as we atomically + * unlinked the kmsg and snapshot the donating state while holding + * the importance lock + */ if (donating) { ip_lock(port); if (III_NULL != inherit) { @@ -2458,14 +2756,14 @@ ipc_importance_inherit_from(ipc_kmsg_t kmsg) ip_unlock(port); } else { /* drop importance from port and destination task */ - if (ipc_port_importance_delta(port, -1) == FALSE) { + if (ipc_port_importance_delta(port, IPID_OPTION_NORMAL, -1) == FALSE) { ip_unlock(port); } } } else if (cleared_self_donation) { ip_lock(port); /* drop cleared donation from port and destination task */ - if 
(ipc_port_importance_delta(port, -1) == FALSE) { + if (ipc_port_importance_delta(port, IPID_OPTION_NORMAL, -1) == FALSE) { ip_unlock(port); } } @@ -2588,7 +2886,6 @@ ipc_importance_receive( ipc_importance_task_t task_imp = task_self->task_imp_base; ipc_port_t port = kmsg->ikm_header->msgh_remote_port; - /* defensive deduction for release builds lacking the assert */ ip_lock(port); ipc_port_impcount_delta(port, -1, IP_NULL); ip_unlock(port); @@ -2688,7 +2985,7 @@ ipc_importance_clean( ip_lock(port); /* inactive ports already had their importance boosts dropped */ if (!ip_active(port) || - ipc_port_importance_delta(port, -1) == FALSE) { + ipc_port_importance_delta(port, IPID_OPTION_NORMAL, -1) == FALSE) { ip_unlock(port); } } diff --git a/osfmk/ipc/ipc_importance.h b/osfmk/ipc/ipc_importance.h index 3b009b42d..15ad62d66 100644 --- a/osfmk/ipc/ipc_importance.h +++ b/osfmk/ipc/ipc_importance.h @@ -224,6 +224,8 @@ extern kern_return_t ipc_importance_task_drop_file_lock_assertion(ipc_importance extern kern_return_t ipc_importance_task_hold_legacy_external_assertion(ipc_importance_task_t task_imp, uint32_t count); extern kern_return_t ipc_importance_task_drop_legacy_external_assertion(ipc_importance_task_t task_imp, uint32_t count); +extern boolean_t ipc_importance_check_circularity(ipc_port_t port, ipc_port_t dest); + /* prepare importance attributes for sending */ extern boolean_t ipc_importance_send( ipc_kmsg_t kmsg, diff --git a/osfmk/ipc/ipc_kmsg.c b/osfmk/ipc/ipc_kmsg.c index 2db958d4d..f5737416a 100644 --- a/osfmk/ipc/ipc_kmsg.c +++ b/osfmk/ipc/ipc_kmsg.c @@ -1375,6 +1375,7 @@ ipc_kmsg_send( ipc_port_t port; thread_t th = current_thread(); mach_msg_return_t error = MACH_MSG_SUCCESS; + boolean_t kernel_reply = FALSE; spl_t s; /* Check if honor qlimit flag is set on thread. 
*/ @@ -1445,6 +1446,7 @@ ipc_kmsg_send( assert(IP_VALID(port)); ip_lock(port); /* fall thru with reply - same options */ + kernel_reply = TRUE; } #if IMPORTANCE_INHERITANCE @@ -1520,6 +1522,18 @@ ipc_kmsg_send( ipc_kmsg_destroy(kmsg); return MACH_MSG_SUCCESS; } + + if (error != MACH_MSG_SUCCESS && kernel_reply) { + /* + * Kernel reply messages that fail can't be allowed to + * pseudo-receive on error conditions. We need to just treat + * the message as a successful delivery. + */ + ip_release(port); /* JMM - Future: release right, not just ref */ + kmsg->ikm_header->msgh_remote_port = MACH_PORT_NULL; + ipc_kmsg_destroy(kmsg); + return MACH_MSG_SUCCESS; + } return error; } @@ -2017,7 +2031,7 @@ ipc_kmsg_copyin_header( ipc_port_t dport = (ipc_port_t)dest_port; /* dport still locked from above */ - if (ipc_port_importance_delta(dport, 1) == FALSE) { + if (ipc_port_importance_delta(dport, IPID_OPTION_SENDPOSSIBLE, 1) == FALSE) { ip_unlock(dport); } } diff --git a/osfmk/ipc/ipc_port.c b/osfmk/ipc/ipc_port.c index e8fbd9449..9a580b9e6 100644 --- a/osfmk/ipc/ipc_port.c +++ b/osfmk/ipc/ipc_port.c @@ -205,7 +205,6 @@ ipc_port_request_alloc( if (port->ip_impdonation != 0 && port->ip_spimportant == 0 && (task_is_importance_donor(current_task()))) { - port->ip_spimportant = 1; *importantp = TRUE; } #endif /* IMPORTANCE_INHERTANCE */ @@ -383,7 +382,6 @@ ipc_port_request_sparm( (port->ip_spimportant == 0) && (((option & MACH_SEND_IMPORTANCE) != 0) || (task_is_importance_donor(current_task())))) { - port->ip_spimportant = 1; return TRUE; } #else @@ -994,11 +992,6 @@ ipc_port_destroy( * but guaranteeing that this doesn't create a circle * port->ip_destination->ip_destination->... == port * - * Additionally, if port was successfully changed to "in transit", - * propagate boost assertions from the "in limbo" port to all - * the ports in the chain, and, if the destination task accepts - * boosts, to the destination task. - * * Conditions: * No ports locked. 
References held for "port" and "dest". */ @@ -1008,13 +1001,11 @@ ipc_port_check_circularity( ipc_port_t port, ipc_port_t dest) { - ipc_port_t base; - #if IMPORTANCE_INHERITANCE - ipc_importance_task_t imp_task = IIT_NULL; - ipc_importance_task_t release_imp_task = IIT_NULL; - int assertcnt = 0; -#endif /* IMPORTANCE_INHERITANCE */ + /* adjust importance counts at the same time */ + return ipc_importance_check_circularity(port, dest); +#else + ipc_port_t base; assert(port != IP_NULL); assert(dest != IP_NULL); @@ -1027,7 +1018,6 @@ ipc_port_check_circularity( * First try a quick check that can run in parallel. * No circularity if dest is not in transit. */ - ip_lock(port); if (ip_lock_try(dest)) { if (!ip_active(dest) || @@ -1108,37 +1098,11 @@ ipc_port_check_circularity( ip_reference(dest); port->ip_destination = dest; -#if IMPORTANCE_INHERITANCE - /* must have been in limbo or still bound to a task */ - assert(port->ip_tempowner != 0); - - /* - * We delayed dropping assertions from a specific task. - * Cache that info now (we'll drop assertions and the - * task reference below). - */ - release_imp_task = port->ip_imp_task; - if (IIT_NULL != release_imp_task) { - port->ip_imp_task = IIT_NULL; - } - assertcnt = port->ip_impcount; - - /* take the port out of limbo w.r.t. 
assertions */ - port->ip_tempowner = 0; - -#endif /* IMPORTANCE_INHERITANCE */ - /* now unlock chain */ ip_unlock(port); for (;;) { - -#if IMPORTANCE_INHERITANCE - /* every port along chain track assertions behind it */ - dest->ip_impcount += assertcnt; -#endif /* IMPORTANCE_INHERITANCE */ - if (dest == base) break; @@ -1148,10 +1112,6 @@ ipc_port_check_circularity( assert(dest->ip_receiver_name == MACH_PORT_NULL); assert(dest->ip_destination != IP_NULL); -#if IMPORTANCE_INHERITANCE - assert(dest->ip_tempowner == 0); -#endif /* IMPORTANCE_INHERITANCE */ - port = dest->ip_destination; ip_unlock(dest); dest = port; @@ -1162,63 +1122,10 @@ ipc_port_check_circularity( (base->ip_receiver_name != MACH_PORT_NULL) || (base->ip_destination == IP_NULL)); -#if IMPORTANCE_INHERITANCE - /* - * Find the task to boost (if any). - * We will boost "through" ports that don't know - * about inheritance to deliver receive rights that - * do. - */ - if (ip_active(base) && (assertcnt > 0)) { - if (base->ip_tempowner != 0) { - if (IIT_NULL != base->ip_imp_task) { - /* specified tempowner task */ - imp_task = base->ip_imp_task; - assert(ipc_importance_task_is_any_receiver_type(imp_task)); - } - /* otherwise don't boost current task */ - - } else if (base->ip_receiver_name != MACH_PORT_NULL) { - ipc_space_t space = base->ip_receiver; - - /* only spaces with boost-accepting tasks */ - if (space->is_task != TASK_NULL && - ipc_importance_task_is_any_receiver_type(space->is_task->task_imp_base)) - imp_task = space->is_task->task_imp_base; - } - - /* take reference before unlocking base */ - if (imp_task != IIT_NULL) { - ipc_importance_task_reference(imp_task); - } - } -#endif /* IMPORTANCE_INHERITANCE */ - ip_unlock(base); -#if IMPORTANCE_INHERITANCE - /* - * Transfer assertions now that the ports are unlocked. - * Avoid extra overhead if transferring to/from the same task. - */ - boolean_t transfer_assertions = (imp_task != release_imp_task) ? 
TRUE : FALSE; - - if (imp_task != IIT_NULL) { - if (transfer_assertions) - ipc_importance_task_hold_internal_assertion(imp_task, assertcnt); - ipc_importance_task_release(imp_task); - imp_task = IIT_NULL; - } - - if (release_imp_task != IIT_NULL) { - if (transfer_assertions) - ipc_importance_task_drop_internal_assertion(release_imp_task, assertcnt); - ipc_importance_task_release(release_imp_task); - release_imp_task = IIT_NULL; - } -#endif /* IMPORTANCE_INHERITANCE */ - return FALSE; +#endif /* !IMPORTANCE_INHERITANCE */ } /* @@ -1255,14 +1162,12 @@ ipc_port_impcount_delta( } absdelta = 0 - delta; - //assert(port->ip_impcount >= absdelta); - /* if we have enough to deduct, we're done */ if (port->ip_impcount >= absdelta) { port->ip_impcount -= absdelta; return delta; } -#if DEVELOPMENT || DEBUG +#if (DEVELOPMENT || DEBUG) if (port->ip_receiver_name != MACH_PORT_NULL) { task_t target_task = port->ip_receiver->is_task; ipc_importance_task_t target_imp = target_task->task_imp_base; @@ -1279,7 +1184,7 @@ ipc_port_impcount_delta( printf("Over-release of importance assertions for port 0x%x receiver pid %d (%s), " "dropping %d assertion(s) but port only has %d remaining.\n", port->ip_receiver_name, - target_imp->iit_bsd_pid, target_imp->iit_procname, + target_pid, target_procname, absdelta, port->ip_impcount); } else if (base != IP_NULL) { @@ -1295,14 +1200,16 @@ ipc_port_impcount_delta( target_procname = "unknown"; target_pid = -1; } - printf("Over-release of importance assertions for port %p " + printf("Over-release of importance assertions for port 0x%lx " "enqueued on port 0x%x with receiver pid %d (%s), " "dropping %d assertion(s) but port only has %d remaining.\n", - port, base->ip_receiver_name, - target_imp->iit_bsd_pid, target_imp->iit_procname, + (unsigned long)VM_KERNEL_UNSLIDE_OR_PERM((uintptr_t)port), + base->ip_receiver_name, + target_pid, target_procname, absdelta, port->ip_impcount); } #endif + delta = 0 - port->ip_impcount; port->ip_impcount = 0; return 
delta; @@ -1318,6 +1225,7 @@ ipc_port_impcount_delta( * and if so, apply the delta. * Conditions: * The port is referenced and locked on entry. + * Importance may be locked. * Nothing else is locked. * The lock may be dropped on exit. * Returns TRUE if lock was dropped. @@ -1327,6 +1235,7 @@ ipc_port_impcount_delta( boolean_t ipc_port_importance_delta_internal( ipc_port_t port, + natural_t options, mach_port_delta_t *deltap, ipc_importance_task_t *imp_task) { @@ -1338,6 +1247,8 @@ ipc_port_importance_delta_internal( if (*deltap == 0) return FALSE; + assert(options == IPID_OPTION_NORMAL || options == IPID_OPTION_SENDPOSSIBLE); + base = port; /* if port is in transit, have to search for end of chain */ @@ -1361,21 +1272,27 @@ ipc_port_importance_delta_internal( ipc_port_multiple_unlock(); } - /* unlock down to the base, adding a boost at each level */ + /* + * If the port lock is dropped b/c the port is in transit, there is a + * race window where another thread can drain messages and/or fire a + * send possible notification before we get here. + * + * We solve this race by checking to see if our caller armed the send + * possible notification, whether or not it's been fired yet, and + * whether or not we've already set the port's ip_spimportant bit. If + * we don't need a send-possible boost, then we'll just apply a + * harmless 0-boost to the port. + */ + if (options & IPID_OPTION_SENDPOSSIBLE) { + assert(*deltap == 1); + if (port->ip_sprequests && port->ip_spimportant == 0) + port->ip_spimportant = 1; + else + *deltap = 0; + } + + /* unlock down to the base, adjusting boost(s) at each level */ for (;;) { - /* - * JMM TODO - because of the port unlock to grab the multiple lock - * above, a subsequent drop of importance could race and beat - * the "previous" increase - causing the port impcount to go - * negative briefly. 
The defensive deduction performed by - * ipc_port_impcount_delta() defeats that, and therefore can - * cause an importance leak once the increase finally arrives. - * - * Need to rework the importance delta logic to be more like - * ipc_importance_inherit_from() where it locks all it needs in - * one pass to avoid any lock drops - to keep that race from - * ever occuring. - */ *deltap = ipc_port_impcount_delta(port, *deltap, base); if (port == base) { @@ -1444,20 +1361,19 @@ ipc_port_importance_delta_internal( boolean_t ipc_port_importance_delta( ipc_port_t port, + natural_t options, mach_port_delta_t delta) { ipc_importance_task_t imp_task = IIT_NULL; boolean_t dropped; - dropped = ipc_port_importance_delta_internal(port, &delta, &imp_task); + dropped = ipc_port_importance_delta_internal(port, options, &delta, &imp_task); - if (IIT_NULL == imp_task) + if (IIT_NULL == imp_task || delta == 0) return dropped; - if (!dropped) { - dropped = TRUE; + if (!dropped) ip_unlock(port); - } assert(ipc_importance_task_is_any_receiver_type(imp_task)); @@ -1467,7 +1383,7 @@ ipc_port_importance_delta( ipc_importance_task_drop_internal_assertion(imp_task, -delta); ipc_importance_task_release(imp_task); - return dropped; + return TRUE; } #endif /* IMPORTANCE_INHERITANCE */ diff --git a/osfmk/ipc/ipc_port.h b/osfmk/ipc/ipc_port.h index 48a2fc49d..92bb0e70a 100644 --- a/osfmk/ipc/ipc_port.h +++ b/osfmk/ipc/ipc_port.h @@ -432,6 +432,12 @@ ipc_port_check_circularity( ipc_port_t dest); #if IMPORTANCE_INHERITANCE + +enum { + IPID_OPTION_NORMAL = 0, /* normal boost */ + IPID_OPTION_SENDPOSSIBLE = 1, /* send-possible induced boost */ +}; + /* apply importance delta to port only */ extern mach_port_delta_t ipc_port_impcount_delta( @@ -443,13 +449,15 @@ ipc_port_impcount_delta( extern boolean_t ipc_port_importance_delta_internal( ipc_port_t port, - mach_port_delta_t *delta, + natural_t options, + mach_port_delta_t *deltap, ipc_importance_task_t *imp_task); /* Apply an importance delta to a 
port and reflect change in receiver task */ extern boolean_t ipc_port_importance_delta( ipc_port_t port, + natural_t options, mach_port_delta_t delta); #endif /* IMPORTANCE_INHERITANCE */ diff --git a/osfmk/ipc/ipc_right.c b/osfmk/ipc/ipc_right.c index b8feb54dc..bf655396e 100644 --- a/osfmk/ipc/ipc_right.c +++ b/osfmk/ipc/ipc_right.c @@ -378,7 +378,7 @@ ipc_right_request_alloc( #if IMPORTANCE_INHERITANCE if (needboost == TRUE) { - if (ipc_port_importance_delta(port, 1) == FALSE) + if (ipc_port_importance_delta(port, IPID_OPTION_SENDPOSSIBLE, 1) == FALSE) ip_unlock(port); } else #endif /* IMPORTANCE_INHERITANCE */ diff --git a/osfmk/ipc/ipc_voucher.c b/osfmk/ipc/ipc_voucher.c index 43880d7f9..47e9bdf3a 100644 --- a/osfmk/ipc/ipc_voucher.c +++ b/osfmk/ipc/ipc_voucher.c @@ -1984,7 +1984,9 @@ mach_voucher_extract_attr_content( * manager referenced during the callout. */ ivgt_lookup(key_index, FALSE, &manager, NULL); - assert(IVAM_NULL != manager); + if (IVAM_NULL == manager) { + return KERN_INVALID_ARGUMENT; + } /* * Get the value(s) to pass to the manager @@ -2060,7 +2062,9 @@ mach_voucher_extract_attr_recipe( * manager referenced during the callout. */ ivgt_lookup(key_index, FALSE, &manager, NULL); - assert(IVAM_NULL != manager); + if (IVAM_NULL == manager) { + return KERN_INVALID_ARGUMENT; + } /* * Get the value(s) to pass to the manager @@ -2126,9 +2130,6 @@ mach_voucher_extract_all_attr_recipes( if (recipe_size - recipe_used < sizeof(*recipe)) return KERN_NO_SPACE; - recipe = (mach_voucher_attr_recipe_t)(void *)&recipes[recipe_used]; - content_size = recipe_size - recipe_used - sizeof(*recipe); - /* * Get the manager for this key_index. 
The * existence of a non-default value for this @@ -2137,6 +2138,12 @@ mach_voucher_extract_all_attr_recipes( */ ivgt_lookup(key_index, FALSE, &manager, NULL); assert(IVAM_NULL != manager); + if (IVAM_NULL == manager) { + continue; + } + + recipe = (mach_voucher_attr_recipe_t)(void *)&recipes[recipe_used]; + content_size = recipe_size - recipe_used - sizeof(*recipe); /* * Get the value(s) to pass to the manager @@ -2266,7 +2273,9 @@ mach_voucher_attr_command( * execution. */ ivgt_lookup(key_index, TRUE, &manager, &control); - assert(IVAM_NULL != manager); + if (IVAM_NULL == manager) { + return KERN_INVALID_ARGUMENT; + } /* * Get the values for this pair diff --git a/osfmk/kern/coalition.c b/osfmk/kern/coalition.c index a5cf30c93..a21666275 100644 --- a/osfmk/kern/coalition.c +++ b/osfmk/kern/coalition.c @@ -51,6 +51,7 @@ */ int coalitions_get_list(int type, struct procinfo_coalinfo *coal_list, int list_sz); boolean_t coalition_is_leader(task_t task, int coal_type, coalition_t *coal); +task_t coalition_get_leader(coalition_t coal); int coalition_get_task_count(coalition_t coal); uint64_t coalition_get_page_count(coalition_t coal, int *ntasks); int coalition_get_pid_list(coalition_t coal, uint32_t rolemask, int sort_order, @@ -168,6 +169,10 @@ struct i_resource_coalition { uint64_t bytesread; uint64_t byteswritten; uint64_t gpu_time; + uint64_t logical_immediate_writes; + uint64_t logical_deferred_writes; + uint64_t logical_invalidated_writes; + uint64_t logical_metadata_writes; uint64_t task_count; /* tasks that have started in this coalition */ uint64_t dead_task_count; /* tasks that have exited in this coalition; @@ -385,6 +390,10 @@ i_coal_resource_remove_task(coalition_t coal, task_t task) cr->bytesread += task->task_io_stats->disk_reads.size; cr->byteswritten += task->task_io_stats->total_io.size - task->task_io_stats->disk_reads.size; cr->gpu_time += task_gpu_utilisation(task); + cr->logical_immediate_writes += task->task_immediate_writes; + 
cr->logical_deferred_writes += task->task_deferred_writes; + cr->logical_invalidated_writes += task->task_invalidated_writes; + cr->logical_metadata_writes += task->task_metadata_writes; /* remove the task from the coalition's list */ remqueue(&task->task_coalition[COALITION_TYPE_RESOURCE]); @@ -451,6 +460,10 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us uint64_t bytesread = coal->r.bytesread; uint64_t byteswritten = coal->r.byteswritten; uint64_t gpu_time = coal->r.gpu_time; + uint64_t logical_immediate_writes = coal->r.logical_immediate_writes; + uint64_t logical_deferred_writes = coal->r.logical_deferred_writes; + uint64_t logical_invalidated_writes = coal->r.logical_invalidated_writes; + uint64_t logical_metadata_writes = coal->r.logical_metadata_writes; int64_t cpu_time_billed_to_me = 0; int64_t cpu_time_billed_to_others = 0; @@ -482,6 +495,10 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us bytesread += task->task_io_stats->disk_reads.size; byteswritten += task->task_io_stats->total_io.size - task->task_io_stats->disk_reads.size; gpu_time += task_gpu_utilisation(task); + logical_immediate_writes += task->task_immediate_writes; + logical_deferred_writes += task->task_deferred_writes; + logical_invalidated_writes += task->task_invalidated_writes; + logical_metadata_writes += task->task_metadata_writes; cpu_time_billed_to_me += (int64_t)bank_billed_time(task->bank_context); cpu_time_billed_to_others += (int64_t)bank_serviced_time(task->bank_context); } @@ -522,6 +539,10 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us cru_out->bytesread = bytesread; cru_out->byteswritten = byteswritten; cru_out->gpu_time = gpu_time; + cru_out->logical_immediate_writes = logical_immediate_writes; + cru_out->logical_deferred_writes = logical_deferred_writes; + cru_out->logical_invalidated_writes = logical_invalidated_writes; + cru_out->logical_metadata_writes = 
logical_metadata_writes; ledger_dereference(sum_ledger); sum_ledger = LEDGER_NULL; @@ -1571,6 +1592,27 @@ boolean_t coalition_is_leader(task_t task, int coal_type, coalition_t *coal) } +task_t coalition_get_leader(coalition_t coal) +{ + task_t leader = TASK_NULL; + + if (!coal) + return TASK_NULL; + + coalition_lock(coal); + if (coal->type != COALITION_TYPE_JETSAM) + goto out_unlock; + + leader = coal->j.leader; + if (leader != TASK_NULL) + task_reference(leader); + +out_unlock: + coalition_unlock(coal); + return leader; +} + + int coalition_get_task_count(coalition_t coal) { int ntasks = 0; diff --git a/osfmk/kern/debug.c b/osfmk/kern/debug.c index 2c79aacdf..cd9b5bb23 100644 --- a/osfmk/kern/debug.c +++ b/osfmk/kern/debug.c @@ -317,7 +317,8 @@ panic_prologue(const char *str) nestedpanic +=1; PANIC_UNLOCK(); Debugger("double panic"); - printf("double panic: We are hanging here...\n"); + // a printf statement here was removed to avoid a panic-loop caused + // by a panic from printf panic_stop(); /* NOTREACHED */ } @@ -341,12 +342,16 @@ panic_epilogue(spl_t s) panicstr = (char *)0; PANIC_UNLOCK(); +#if DEVELOPMENT || DEBUG if (return_on_panic) { panic_normal(); enable_preemption(); splx(s); return; } +#else + (void)s; +#endif kdb_printf("panic: We are hanging here...\n"); panic_stop(); /* NOTREACHED */ diff --git a/osfmk/kern/ipc_kobject.c b/osfmk/kern/ipc_kobject.c index 1789ae5f6..7aa0466db 100644 --- a/osfmk/kern/ipc_kobject.c +++ b/osfmk/kern/ipc_kobject.c @@ -266,6 +266,7 @@ ipc_kobject_server( ipc_kmsg_t reply; kern_return_t kr; ipc_port_t *destp; + ipc_port_t replyp = IPC_PORT_NULL; mach_msg_format_0_trailer_t *trailer; register mig_hash_t *ptr; @@ -341,10 +342,10 @@ ipc_kobject_server( } else { if (!ipc_kobject_notify(request->ikm_header, reply->ikm_header)){ -#if MACH_IPC_TEST +#if DEVELOPMENT || DEBUG printf("ipc_kobject_server: bogus kernel message, id=%d\n", request->ikm_header->msgh_id); -#endif /* MACH_IPC_TEST */ +#endif /* DEVELOPMENT || DEBUG */ 
_MIG_MSGID_INVALID(request->ikm_header->msgh_id); ((mig_reply_error_t *) reply->ikm_header)->RetCode @@ -419,6 +420,8 @@ ipc_kobject_server( ipc_kmsg_destroy(request); } + replyp = (ipc_port_t)reply->ikm_header->msgh_remote_port; + if (kr == MIG_NO_REPLY) { /* * The server function will send a reply message @@ -428,7 +431,7 @@ ipc_kobject_server( ipc_kmsg_free(reply); return IKM_NULL; - } else if (!IP_VALID((ipc_port_t)reply->ikm_header->msgh_remote_port)) { + } else if (!IP_VALID(replyp)) { /* * Can't queue the reply message if the destination * (the reply port) isn't valid. @@ -436,6 +439,17 @@ ipc_kobject_server( ipc_kmsg_destroy(reply); + return IKM_NULL; + } else if (replyp->ip_receiver == ipc_space_kernel) { + /* + * Don't send replies to kobject kernel ports + */ +#if DEVELOPMENT || DEBUG + printf("%s: refusing to send reply to kobject %d port (id:%d)\n", + __func__, ip_kotype(replyp), + request->ikm_header->msgh_id); +#endif /* DEVELOPMENT || DEBUG */ + ipc_kmsg_destroy(reply); return IKM_NULL; } @@ -528,9 +542,22 @@ ipc_kobject_notify( mach_msg_header_t *request_header, mach_msg_header_t *reply_header) { + mach_msg_max_trailer_t * trailer; ipc_port_t port = (ipc_port_t) request_header->msgh_remote_port; ((mig_reply_error_t *) reply_header)->RetCode = MIG_NO_REPLY; + + trailer = (mach_msg_max_trailer_t *) + ((vm_offset_t)request_header + request_header->msgh_size); + if (0 != bcmp(&trailer->msgh_audit, &KERNEL_AUDIT_TOKEN, + sizeof(trailer->msgh_audit))) { + return FALSE; + } + if (0 != bcmp(&trailer->msgh_sender, &KERNEL_SECURITY_TOKEN, + sizeof(trailer->msgh_sender))) { + return FALSE; + } + switch (request_header->msgh_id) { case MACH_NOTIFY_NO_SENDERS: switch (ip_kotype(port)) { diff --git a/osfmk/kern/kpc_common.c b/osfmk/kern/kpc_common.c index c091eb115..26bdae8f2 100644 --- a/osfmk/kern/kpc_common.c +++ b/osfmk/kern/kpc_common.c @@ -412,6 +412,7 @@ kpc_get_config(uint32_t classes, kpc_config_t *current_config) int kpc_set_config(uint32_t classes, 
kpc_config_t *configv) { + int ret = 0; struct kpc_config_remote mp_config = { .classes = classes, .configv = configv, .pmc_mask = kpc_get_configurable_pmc_mask(classes) @@ -437,11 +438,11 @@ kpc_set_config(uint32_t classes, kpc_config_t *configv) if (classes & KPC_CLASS_POWER_MASK) mp_config.classes |= KPC_CLASS_CONFIGURABLE_MASK; - kpc_set_config_arch( &mp_config ); + ret = kpc_set_config_arch( &mp_config ); lck_mtx_unlock(&kpc_config_lock); - return 0; + return ret; } /* allocate a buffer large enough for all possible counters */ diff --git a/osfmk/kern/machine.c b/osfmk/kern/machine.c index 3deffa426..dfe33564d 100644 --- a/osfmk/kern/machine.c +++ b/osfmk/kern/machine.c @@ -151,10 +151,12 @@ host_reboot( assert(host_priv == &realhost); +#if DEVELOPMENT || DEBUG if (options & HOST_REBOOT_DEBUGGER) { Debugger("Debugger"); return (KERN_SUCCESS); } +#endif if (options & HOST_REBOOT_UPSDELAY) { // UPS power cutoff path diff --git a/osfmk/kern/sched_prim.c b/osfmk/kern/sched_prim.c index 9a0a9427c..5ee363dc7 100644 --- a/osfmk/kern/sched_prim.c +++ b/osfmk/kern/sched_prim.c @@ -896,7 +896,7 @@ assert_wait( KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, - VM_KERNEL_UNSLIDE(event), 0, 0, 0, 0); + VM_KERNEL_UNSLIDE_OR_PERM(event), 0, 0, 0, 0); struct waitq *waitq; waitq = global_eventq(event); @@ -929,7 +929,7 @@ assert_wait_timeout( KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, - VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0); + VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0); wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event), interruptible, @@ -978,7 +978,7 @@ assert_wait_timeout_with_leeway( KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, - VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0); + VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0); wresult = 
waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event), interruptible, @@ -1013,7 +1013,7 @@ assert_wait_deadline( KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, - VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0); + VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0); wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event), interruptible, @@ -1049,7 +1049,7 @@ assert_wait_deadline_with_leeway( KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, - VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0); + VM_KERNEL_UNSLIDE_OR_PERM(event), interruptible, deadline, 0, 0); wresult = waitq_assert_wait64_locked(waitq, CAST_EVENT64_T(event), interruptible, diff --git a/osfmk/kern/startup.c b/osfmk/kern/startup.c index 53013fa79..eada1fb64 100644 --- a/osfmk/kern/startup.c +++ b/osfmk/kern/startup.c @@ -284,6 +284,11 @@ kernel_bootstrap(void) csr_init(); #endif + if (PE_i_can_has_debugger(NULL) && + PE_parse_boot_argn("-show_pointers", &namep, sizeof (namep))) { + doprnt_hide_pointers = FALSE; + } + kernel_bootstrap_log("stackshot_lock_init"); stackshot_lock_init(); @@ -548,12 +553,12 @@ kernel_bootstrap_thread(void) vm_commpage_init(); vm_commpage_text_init(); - #if CONFIG_MACF kernel_bootstrap_log("mac_policy_initmach"); mac_policy_initmach(); #endif + #if CONFIG_SCHED_SFI kernel_bootstrap_log("sfi_init"); sfi_init(); diff --git a/osfmk/kern/sync_sema.c b/osfmk/kern/sync_sema.c index 0cba287b0..4304559e6 100644 --- a/osfmk/kern/sync_sema.c +++ b/osfmk/kern/sync_sema.c @@ -206,37 +206,29 @@ semaphore_create( /* * Routine: semaphore_destroy_internal * - * This call will only succeed if the specified task is the SAME task - * specified at the semaphore's creation. + * Disassociate a semaphore from its owning task, mark it inactive, + * and set any waiting threads running with THREAD_RESTART. 
* - * All threads currently blocked on the semaphore are awoken. These - * threads will return with the KERN_TERMINATED error. + * Conditions: + * task is locked + * semaphore is locked + * semaphore is owned by the specified task + * Returns: + * with semaphore unlocked */ -kern_return_t +static void semaphore_destroy_internal( task_t task, semaphore_t semaphore) { int old_count; - spl_t spl_level; - - /* - * Disown semaphore - */ - task_lock(task); - if (semaphore->owner != task) { - task_unlock(task); - return KERN_INVALID_ARGUMENT; - } - spl_level = splsched(); - semaphore_lock(semaphore); + /* unlink semaphore from owning task */ + assert(semaphore->owner == task); remqueue((queue_entry_t) semaphore); semaphore->owner = TASK_NULL; task->semaphores_owned--; - task_unlock(task); - /* * Deactivate semaphore */ @@ -259,9 +251,6 @@ semaphore_destroy_internal( } else { semaphore_unlock(semaphore); } - splx(spl_level); - - return KERN_SUCCESS; } /* @@ -275,18 +264,75 @@ semaphore_destroy( task_t task, semaphore_t semaphore) { - kern_return_t kr; + spl_t spl_level; if (semaphore == SEMAPHORE_NULL) return KERN_INVALID_ARGUMENT; if (task == TASK_NULL) { - kr = KERN_INVALID_ARGUMENT; - } else { - kr = semaphore_destroy_internal(task, semaphore); + semaphore_dereference(semaphore); + return KERN_INVALID_ARGUMENT; + } + + task_lock(task); + spl_level = splsched(); + semaphore_lock(semaphore); + + if (semaphore->owner != task) { + semaphore_unlock(semaphore); + splx(spl_level); + task_unlock(task); + return KERN_INVALID_ARGUMENT; } + + semaphore_destroy_internal(task, semaphore); + /* semaphore unlocked */ + + splx(spl_level); + task_unlock(task); + semaphore_dereference(semaphore); - return kr; + return KERN_SUCCESS; +} + +/* + * Routine: semaphore_destroy_all + * + * Destroy all the semaphores associated with a given task. 
+ */ +#define SEMASPERSPL 20 /* max number of semaphores to destroy per spl hold */ + +void +semaphore_destroy_all( + task_t task) +{ + uint32_t count; + spl_t spl_level; + + count = 0; + task_lock(task); + while (!queue_empty(&task->semaphore_list)) { + semaphore_t semaphore; + + semaphore = (semaphore_t) queue_first(&task->semaphore_list); + + if (count == 0) + spl_level = splsched(); + semaphore_lock(semaphore); + + semaphore_destroy_internal(task, semaphore); + /* semaphore unlocked */ + + /* throttle number of semaphores per interrupt disablement */ + if (++count == SEMASPERSPL) { + count = 0; + splx(spl_level); + } + } + if (count != 0) + splx(spl_level); + + task_unlock(task); } /* @@ -1072,6 +1118,9 @@ void semaphore_dereference( semaphore_t semaphore) { + uint32_t collisions; + spl_t spl_level; + if (semaphore == NULL) return; @@ -1090,10 +1139,37 @@ semaphore_dereference( assert(!port->ip_srights); ipc_port_dealloc_kernel(port); } - if (semaphore->active) { - assert(semaphore->owner != TASK_NULL); - semaphore_destroy_internal(semaphore->owner, semaphore); + + /* + * Lock the semaphore to lock in the owner task reference. + * Then continue to try to lock the task (inverse order). 
+ */ + spl_level = splsched(); + semaphore_lock(semaphore); + for (collisions = 0; semaphore->active; collisions++) { + task_t task = semaphore->owner; + + assert(task != TASK_NULL); + + if (task_lock_try(task)) { + semaphore_destroy_internal(task, semaphore); + /* semaphore unlocked */ + splx(spl_level); + task_unlock(task); + goto out; + } + + /* failed to get out-of-order locks */ + semaphore_unlock(semaphore); + splx(spl_level); + mutex_pause(collisions); + spl_level = splsched(); + semaphore_lock(semaphore); } + semaphore_unlock(semaphore); + splx(spl_level); + + out: zfree(semaphore_zone, semaphore); } diff --git a/osfmk/kern/sync_sema.h b/osfmk/kern/sync_sema.h index 339eb9e93..2187c6bae 100644 --- a/osfmk/kern/sync_sema.h +++ b/osfmk/kern/sync_sema.h @@ -64,7 +64,7 @@ extern void semaphore_init(void); extern void semaphore_reference(semaphore_t semaphore); extern void semaphore_dereference(semaphore_t semaphore); -extern kern_return_t semaphore_destroy_internal(task_t task, semaphore_t semaphore); +extern void semaphore_destroy_all(task_t task); #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/kern/task.c b/osfmk/kern/task.c index 498fd09c8..d0e982ee4 100644 --- a/osfmk/kern/task.c +++ b/osfmk/kern/task.c @@ -189,6 +189,10 @@ struct _task_ledger_indices task_ledgers __attribute__((used)) = #endif }; +/* System sleep state */ +boolean_t tasks_suspend_state; + + void init_task_ledgers(void); void task_footprint_exceeded(int warning, __unused const void *param0, __unused const void *param1); void task_wakeups_rate_exceeded(int warning, __unused const void *param0, __unused const void *param1); @@ -968,6 +972,10 @@ task_create_internal( new_task->task_io_stats = (io_stat_info_t)kalloc(sizeof(struct io_stat_info)); assert(new_task->task_io_stats != NULL); bzero(new_task->task_io_stats, sizeof(struct io_stat_info)); + new_task->task_immediate_writes = 0; + new_task->task_deferred_writes = 0; + new_task->task_invalidated_writes = 0; + 
new_task->task_metadata_writes = 0; bzero(&(new_task->cpu_time_qos_stats), sizeof(struct _cpu_time_qos_stats)); @@ -1015,6 +1023,9 @@ task_create_internal( lck_mtx_lock(&tasks_threads_lock); queue_enter(&tasks, new_task, task_t, tasks); tasks_count++; + if (tasks_suspend_state) { + task_suspend_internal(new_task); + } lck_mtx_unlock(&tasks_threads_lock); *child_task = new_task; @@ -1613,6 +1624,23 @@ task_terminate_internal( return (KERN_SUCCESS); } +void +tasks_system_suspend(boolean_t suspend) +{ + task_t task; + + lck_mtx_lock(&tasks_threads_lock); + assert(tasks_suspend_state != suspend); + tasks_suspend_state = suspend; + queue_iterate(&tasks, task, task_t, tasks) { + if (task == kernel_task) { + continue; + } + suspend ? task_suspend_internal(task) : task_resume_internal(task); + } + lck_mtx_unlock(&tasks_threads_lock); +} + /* * task_start_halt: * @@ -3831,16 +3859,10 @@ task_set_ras_pc( void task_synchronizer_destroy_all(task_t task) { - semaphore_t semaphore; - /* * Destroy owned semaphores */ - - while (!queue_empty(&task->semaphore_list)) { - semaphore = (semaphore_t) queue_first(&task->semaphore_list); - (void) semaphore_destroy_internal(task, semaphore); - } + semaphore_destroy_all(task); } /* @@ -4517,3 +4539,23 @@ boolean_t task_is_gpu_denied(task_t task) /* We don't need the lock to read this flag */ return (task->t_flags & TF_GPU_DENIED) ? 
TRUE : FALSE; } + +void task_update_logical_writes(task_t task, uint32_t io_size, int flags) +{ + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_DATA_WRITE)) | DBG_FUNC_NONE, task_pid(task), io_size, flags, 0, 0); + switch(flags) { + case TASK_WRITE_IMMEDIATE: + OSAddAtomic64(io_size, (SInt64 *)&(task->task_immediate_writes)); + break; + case TASK_WRITE_DEFERRED: + OSAddAtomic64(io_size, (SInt64 *)&(task->task_deferred_writes)); + break; + case TASK_WRITE_INVALIDATED: + OSAddAtomic64(io_size, (SInt64 *)&(task->task_invalidated_writes)); + break; + case TASK_WRITE_METADATA: + OSAddAtomic64(io_size, (SInt64 *)&(task->task_metadata_writes)); + break; + } + return; +} diff --git a/osfmk/kern/task.h b/osfmk/kern/task.h index 7b7c15306..5ddff0c75 100644 --- a/osfmk/kern/task.h +++ b/osfmk/kern/task.h @@ -352,8 +352,12 @@ struct task { low_mem_privileged_listener :1, /* if set, task would like to know about pressure changes before other tasks on the system */ mem_notify_reserved :27; /* reserved for future use */ - io_stat_info_t task_io_stats; - + io_stat_info_t task_io_stats; + uint64_t task_immediate_writes __attribute__((aligned(8))); + uint64_t task_deferred_writes __attribute__((aligned(8))); + uint64_t task_invalidated_writes __attribute__((aligned(8))); + uint64_t task_metadata_writes __attribute__((aligned(8))); + /* * The cpu_time_qos_stats fields are protected by the task lock */ @@ -490,6 +494,8 @@ extern kern_return_t task_send_trace_memory( uint32_t pid, uint64_t uniqueid); +extern void tasks_system_suspend(boolean_t suspend); + #if CONFIG_FREEZE /* Freeze a task's resident pages */ @@ -851,6 +857,12 @@ extern kern_return_t task_purge_volatile_memory(task_t task); extern void task_set_gpu_denied(task_t task, boolean_t denied); extern boolean_t task_is_gpu_denied(task_t task); +#define TASK_WRITE_IMMEDIATE 0x1 +#define TASK_WRITE_DEFERRED 0x2 +#define TASK_WRITE_INVALIDATED 0x4 +#define TASK_WRITE_METADATA 0x8 +extern void 
task_update_logical_writes(task_t task, uint32_t io_size, int flags); + #endif /* XNU_KERNEL_PRIVATE */ #ifdef KERNEL_PRIVATE diff --git a/osfmk/kern/thread.c b/osfmk/kern/thread.c index 6e9a472a0..535d1e319 100644 --- a/osfmk/kern/thread.c +++ b/osfmk/kern/thread.c @@ -1920,18 +1920,6 @@ THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU__SENDING_EXC_RESOURCE(void) } } -#define UPDATE_IO_STATS(info, size) \ -{ \ - info.count++; \ - info.size += size; \ -} - -#define UPDATE_IO_STATS_ATOMIC(info, size) \ -{ \ - OSIncrementAtomic64((SInt64 *)&(info.count)); \ - OSAddAtomic64(size, (SInt64 *)&(info.size)); \ -} - void thread_update_io_stats(thread_t thread, int size, int io_flags) { int io_tier; diff --git a/osfmk/kern/thread_call.c b/osfmk/kern/thread_call.c index 82339e208..6e406e8b7 100644 --- a/osfmk/kern/thread_call.c +++ b/osfmk/kern/thread_call.c @@ -1207,7 +1207,7 @@ thread_call_thread( #if DEVELOPMENT || DEBUG KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED,MACH_CALLOUT) | DBG_FUNC_NONE, - VM_KERNEL_UNSLIDE(func), param0, param1, 0, 0); + VM_KERNEL_UNSLIDE(func), VM_KERNEL_UNSLIDE_OR_PERM(param0), VM_KERNEL_UNSLIDE_OR_PERM(param1), 0, 0); #endif /* DEVELOPMENT || DEBUG */ #if CONFIG_DTRACE diff --git a/osfmk/kern/timer_call.c b/osfmk/kern/timer_call.c index 8c65ed2f8..fead4d663 100644 --- a/osfmk/kern/timer_call.c +++ b/osfmk/kern/timer_call.c @@ -417,9 +417,9 @@ timer_call_enqueue_deadline_unlocked( #if TIMER_ASSERT TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE, - call, + VM_KERNEL_UNSLIDE_OR_PERM(call), call->async_dequeue, - TCE(call)->queue, + VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue), 0x1c, 0); timer_call_enqueue_deadline_unlocked_async1++; #endif @@ -471,9 +471,9 @@ timer_call_dequeue_unlocked( #if TIMER_ASSERT TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE, - call, + VM_KERNEL_UNSLIDE_OR_PERM(call), call->async_dequeue, - TCE(call)->queue, + VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue), 0, 0); #endif if (old_queue 
!= NULL) { @@ -483,9 +483,9 @@ timer_call_dequeue_unlocked( #if TIMER_ASSERT TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE, - call, + VM_KERNEL_UNSLIDE_OR_PERM(call), call->async_dequeue, - TCE(call)->queue, + VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue), 0x1c, 0); timer_call_dequeue_unlocked_async1++; #endif @@ -570,8 +570,8 @@ timer_call_enter_internal( TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_START, - call, - param1, deadline, flags, 0); + VM_KERNEL_UNSLIDE_OR_PERM(call), + VM_KERNEL_UNSLIDE_OR_PERM(param1), deadline, flags, 0); urgency = (flags & TIMER_CALL_URGENCY_MASK); @@ -634,7 +634,7 @@ timer_call_enter_internal( TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END, - call, + VM_KERNEL_UNSLIDE_OR_PERM(call), (old_queue != NULL), deadline, queue->count, 0); splx(s); @@ -688,7 +688,7 @@ timer_call_cancel( TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_START, - call, + VM_KERNEL_UNSLIDE_OR_PERM(call), TCE(call)->deadline, call->soft_deadline, call->flags, 0); old_queue = timer_call_dequeue_unlocked(call); @@ -708,8 +708,8 @@ timer_call_cancel( } TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_END, - call, - old_queue, + VM_KERNEL_UNSLIDE_OR_PERM(call), + VM_KERNEL_UNSLIDE_OR_PERM(old_queue), TCE(call)->deadline - mach_absolute_time(), TCE(call)->deadline - TCE(call)->entry_time, 0); splx(s); @@ -754,9 +754,9 @@ timer_queue_shutdown( #if TIMER_ASSERT TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE, - call, + VM_KERNEL_UNSLIDE_OR_PERM(call), call->async_dequeue, - TCE(call)->queue, + VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue), 0x2b, 0); #endif timer_queue_unlock(queue); @@ -824,7 +824,7 @@ timer_queue_expire_with_options( TCOAL_DEBUG(0xDDDD0000, queue->earliest_soft_deadline, call->soft_deadline, 0, 0, 0); TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_EXPIRE | DBG_FUNC_NONE, - call, + VM_KERNEL_UNSLIDE_OR_PERM(call), call->soft_deadline, TCE(call)->deadline, 
TCE(call)->entry_time, 0); @@ -854,7 +854,10 @@ timer_queue_expire_with_options( TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_START, - call, VM_KERNEL_UNSLIDE(func), param0, param1, 0); + VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func), + VM_KERNEL_UNSLIDE_OR_PERM(param0), + VM_KERNEL_UNSLIDE_OR_PERM(param1), + 0); #if CONFIG_DTRACE DTRACE_TMR7(callout__start, timer_call_func_t, func, @@ -876,7 +879,10 @@ timer_queue_expire_with_options( TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_END, - call, VM_KERNEL_UNSLIDE(func), param0, param1, 0); + VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func), + VM_KERNEL_UNSLIDE_OR_PERM(param0), + VM_KERNEL_UNSLIDE_OR_PERM(param1), + 0); call = NULL; timer_queue_lock_spin(queue); } else { @@ -1013,9 +1019,9 @@ timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to) #ifdef TIMER_ASSERT TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE, - call, - TCE(call)->queue, - call->lock.interlock.lock_data, + VM_KERNEL_UNSLIDE_OR_PERM(call), + VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue), + VM_KERNEL_UNSLIDE_OR_PERM(call->lock.interlock.lock_data), 0x2b, 0); #endif timer_queue_migrate_lock_skips++; @@ -1071,7 +1077,7 @@ timer_queue_trace( call->soft_deadline, TCE(call)->deadline, TCE(call)->entry_time, - TCE(call)->func, + VM_KERNEL_UNSLIDE(TCE(call)->func), 0); call = TIMER_CALL(queue_next(qe(call))); } while (!queue_end(&queue->head, qe(call))); @@ -1223,9 +1229,9 @@ timer_longterm_scan(timer_longterm_t *tlp, #ifdef TIMER_ASSERT TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE, - call, - TCE(call)->queue, - call->lock.interlock.lock_data, + VM_KERNEL_UNSLIDE_OR_PERM(call), + VM_KERNEL_UNSLIDE_OR_PERM(TCE(call)->queue), + VM_KERNEL_UNSLIDE_OR_PERM(call->lock.interlock.lock_data), 0x2c, 0); #endif timer_call_entry_dequeue_async(call); @@ -1240,7 +1246,7 @@ timer_longterm_scan(timer_longterm_t *tlp, if (deadline < now) 
TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_OVERDUE | DBG_FUNC_NONE, - call, + VM_KERNEL_UNSLIDE_OR_PERM(call), deadline, now, threshold, @@ -1248,10 +1254,10 @@ timer_longterm_scan(timer_longterm_t *tlp, #endif TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ESCALATE | DBG_FUNC_NONE, - call, + VM_KERNEL_UNSLIDE_OR_PERM(call), TCE(call)->deadline, TCE(call)->entry_time, - TCE(call)->func, + VM_KERNEL_UNSLIDE(TCE(call)->func), 0); tlp->escalates++; timer_call_entry_dequeue(call); @@ -1289,7 +1295,7 @@ timer_longterm_update_locked(timer_longterm_t *tlp) TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_UPDATE | DBG_FUNC_START, - &tlp->queue, + VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue), tlp->threshold.deadline, tlp->threshold.preempted, tlp->queue.count, 0); @@ -1336,7 +1342,7 @@ timer_longterm_update_locked(timer_longterm_t *tlp) TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_UPDATE | DBG_FUNC_END, - &tlp->queue, + VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue), tlp->threshold.deadline, tlp->threshold.scans, tlp->queue.count, 0); diff --git a/osfmk/kern/waitq.c b/osfmk/kern/waitq.c index be41d6143..fad26e5ba 100644 --- a/osfmk/kern/waitq.c +++ b/osfmk/kern/waitq.c @@ -584,9 +584,9 @@ struct wqt_elem *wq_table_alloc_elem(struct wq_table *table, int type, int nelem type, table); assert(nelem > 0); - elem = NULL; try_again: + elem = NULL; if (ntries++ > max_retries) { struct wqt_elem *tmp; if (table->used_elem + nelem >= table_size) @@ -4568,7 +4568,7 @@ static inline int waitq_maybe_remove_link(struct waitq *waitq, * WQS we're unlinking, or to an invalid object: * no need to invalidate it */ - *wq_setid = right->sl_set_id.id; + *wq_setid = right ? right->sl_set_id.id : 0; lt_invalidate(parent); wqdbg_v("S1, L"); return left ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID; @@ -4578,7 +4578,7 @@ static inline int waitq_maybe_remove_link(struct waitq *waitq, * WQS we're unlinking, or to an invalid object: * no need to invalidate it */ - *wq_setid = left->sl_set_id.id; + *wq_setid = left ? 
left->sl_set_id.id : 0; lt_invalidate(parent); wqdbg_v("S1, R"); return right ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID; diff --git a/osfmk/kern/zalloc.c b/osfmk/kern/zalloc.c index 6bcab735d..351268d34 100644 --- a/osfmk/kern/zalloc.c +++ b/osfmk/kern/zalloc.c @@ -375,6 +375,13 @@ uint64_t zone_map_table_page_count = 0; vm_offset_t zone_map_min_address = 0; /* initialized in zone_init */ vm_offset_t zone_map_max_address = 0; +/* Globals for random boolean generator for elements in free list */ +#define MAX_ENTROPY_PER_ZCRAM 4 +#define RANDOM_BOOL_GEN_SEED_COUNT 4 +static unsigned int bool_gen_seed[RANDOM_BOOL_GEN_SEED_COUNT]; +static unsigned int bool_gen_global = 0; +decl_simple_lock_data(, bool_gen_lock) + /* Helpful for walking through a zone's free element list. */ struct zone_free_element { struct zone_free_element *next; @@ -1924,6 +1931,84 @@ zone_prio_refill_configure(zone_t z, vm_size_t low_water_mark) { thread_deallocate(z->zone_replenish_thread); } +/* + * Boolean Random Number Generator for generating booleans to randomize + * the order of elements in newly zcram()'ed memory. 
The algorithm is a + * modified version of the KISS RNG proposed in the paper: + * http://stat.fsu.edu/techreports/M802.pdf + * The modifications have been documented in the technical paper + * paper from UCL: + * http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf + */ + +static void random_bool_gen_entropy( + int *buffer, + int count) +{ + + int i, t; + simple_lock(&bool_gen_lock); + for (i = 0; i < count; i++) { + bool_gen_seed[1] ^= (bool_gen_seed[1] << 5); + bool_gen_seed[1] ^= (bool_gen_seed[1] >> 7); + bool_gen_seed[1] ^= (bool_gen_seed[1] << 22); + t = bool_gen_seed[2] + bool_gen_seed[3] + bool_gen_global; + bool_gen_seed[2] = bool_gen_seed[3]; + bool_gen_global = t < 0; + bool_gen_seed[3] = t &2147483647; + bool_gen_seed[0] += 1411392427; + buffer[i] = (bool_gen_seed[0] + bool_gen_seed[1] + bool_gen_seed[3]); + } + simple_unlock(&bool_gen_lock); +} + +static boolean_t random_bool_gen( + int *buffer, + int index, + int bufsize) +{ + int valindex, bitpos; + valindex = (index / (8 * sizeof(int))) % bufsize; + bitpos = index % (8 * sizeof(int)); + return (boolean_t)(buffer[valindex] & (1 << bitpos)); +} + +static void +random_free_to_zone( + zone_t zone, + vm_offset_t newmem, + vm_offset_t first_element_offset, + int element_count, + boolean_t from_zm, + int *entropy_buffer) +{ + vm_offset_t last_element_offset; + vm_offset_t element_addr; + vm_size_t elem_size; + int index; + + elem_size = zone->elem_size; + last_element_offset = first_element_offset + ((element_count * elem_size) - elem_size); + for (index = 0; index < element_count; index++) { + assert(first_element_offset <= last_element_offset); + if (random_bool_gen(entropy_buffer, index, MAX_ENTROPY_PER_ZCRAM)) { + element_addr = newmem + first_element_offset; + first_element_offset += elem_size; + } else { + element_addr = newmem + last_element_offset; + last_element_offset -= elem_size; + } + if (element_addr != (vm_offset_t)zone) { + zone->count++; /* compensate for free_to_zone */ + 
free_to_zone(zone, element_addr, FALSE); + } + if (!zone->use_page_list && from_zm) { + zone_page_alloc(element_addr, elem_size); + } + zone->cur_size += elem_size; + } +} + /* * Cram the given memory into the specified zone. Update the zone page count accordingly. */ @@ -1935,6 +2020,9 @@ zcram( { vm_size_t elem_size; boolean_t from_zm = FALSE; + vm_offset_t first_element_offset; + int element_count; + int entropy_buffer[MAX_ENTROPY_PER_ZCRAM]; /* Basic sanity checks */ assert(zone != ZONE_NULL && newmem != (vm_offset_t)0); @@ -1943,6 +2031,8 @@ zcram( elem_size = zone->elem_size; + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_START, VM_KERNEL_ADDRPERM(zone), size, 0, 0, 0); + if (from_zone_map(newmem, size)) from_zm = TRUE; @@ -1955,6 +2045,8 @@ zcram( ZONE_PAGE_COUNT_INCR(zone, (size / PAGE_SIZE)); + random_bool_gen_entropy(entropy_buffer, MAX_ENTROPY_PER_ZCRAM); + lock_zone(zone); if (zone->use_page_list) { @@ -1965,7 +2057,6 @@ zcram( assert((size & PAGE_MASK) == 0); for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) { - vm_size_t pos_in_page; page_metadata = (struct zone_page_metadata *)(newmem); page_metadata->pages.next = NULL; @@ -1977,36 +2068,24 @@ zcram( enqueue_tail(&zone->pages.all_used, (queue_entry_t)page_metadata); - vm_offset_t first_element_offset; if (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT == 0){ first_element_offset = zone_page_metadata_size; } else { first_element_offset = zone_page_metadata_size + (ZONE_ELEMENT_ALIGNMENT - (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT)); } - - for (pos_in_page = first_element_offset; (newmem + pos_in_page + elem_size) < (vm_offset_t)(newmem + PAGE_SIZE); pos_in_page += elem_size) { - page_metadata->alloc_count++; - zone->count++; /* compensate for free_to_zone */ - free_to_zone(zone, newmem + pos_in_page, FALSE); - zone->cur_size += elem_size; - } - } - } else { - while (size >= elem_size) { - zone->count++; /* compensate for free_to_zone */ - if (newmem == 
(vm_offset_t)zone) { - /* Don't free zone_zone zone */ - } else { - free_to_zone(zone, newmem, FALSE); - } - if (from_zm) - zone_page_alloc(newmem, elem_size); - size -= elem_size; - newmem += elem_size; - zone->cur_size += elem_size; + element_count = (int)((PAGE_SIZE - first_element_offset) / elem_size); + page_metadata->alloc_count += element_count; + random_free_to_zone(zone, newmem, first_element_offset, element_count, from_zm, entropy_buffer); } + } else { + first_element_offset = 0; + element_count = (int)((size - first_element_offset) / elem_size); + random_free_to_zone(zone, newmem, first_element_offset, element_count, from_zm, entropy_buffer); } unlock_zone(zone); + + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_END, VM_KERNEL_ADDRPERM(zone), 0, 0, 0, 0); + } @@ -2070,6 +2149,7 @@ void zone_bootstrap(void) { char temp_buf[16]; + unsigned int i; if (PE_parse_boot_argn("-zinfop", temp_buf, sizeof(temp_buf))) { zinfo_per_task = TRUE; @@ -2081,6 +2161,12 @@ zone_bootstrap(void) /* Set up zone element poisoning */ zp_init(); + /* Seed the random boolean generator for elements in zone free list */ + for (i = 0; i < RANDOM_BOOL_GEN_SEED_COUNT; i++) { + bool_gen_seed[i] = (unsigned int)early_random(); + } + simple_lock_init(&bool_gen_lock, 0); + /* should zlog log to debug zone corruption instead of leaks? 
*/ if (PE_parse_boot_argn("-zc", temp_buf, sizeof(temp_buf))) { corruption_debug_flag = TRUE; @@ -2139,7 +2225,6 @@ zone_bootstrap(void) /* initialize fake zones and zone info if tracking by task */ if (zinfo_per_task) { vm_size_t zisize = sizeof(zinfo_usage_store_t) * ZINFO_SLOTS; - unsigned int i; for (i = 0; i < num_fake_zones; i++) fake_zones[i].init(ZINFO_SLOTS - num_fake_zones + i); diff --git a/osfmk/mach/coalition.h b/osfmk/mach/coalition.h index 2c0b22e41..6b2038de4 100644 --- a/osfmk/mach/coalition.h +++ b/osfmk/mach/coalition.h @@ -88,6 +88,10 @@ struct coalition_resource_usage { uint64_t gpu_time; uint64_t cpu_time_billed_to_me; uint64_t cpu_time_billed_to_others; + uint64_t logical_immediate_writes; + uint64_t logical_deferred_writes; + uint64_t logical_invalidated_writes; + uint64_t logical_metadata_writes; }; #ifdef PRIVATE diff --git a/osfmk/mach/kern_return.h b/osfmk/mach/kern_return.h index bfedcc3ec..f0fa37d27 100644 --- a/osfmk/mach/kern_return.h +++ b/osfmk/mach/kern_return.h @@ -319,6 +319,10 @@ /* The requested property cannot be changed at this time. */ +#define KERN_INSUFFICIENT_BUFFER_SIZE 52 + /* The provided buffer is of insufficient size for the requested data. 
+ */ + #define KERN_RETURN_MAX 0x100 /* Maximum return value allowable */ diff --git a/osfmk/mach/machine.h b/osfmk/mach/machine.h index 09ea8bb8a..9c459178a 100644 --- a/osfmk/mach/machine.h +++ b/osfmk/mach/machine.h @@ -405,7 +405,6 @@ __END_DECLS #define CPUFAMILY_ARM_SWIFT 0x1e2d6381 #define CPUFAMILY_ARM_CYCLONE 0x37a09642 #define CPUFAMILY_ARM_TYPHOON 0x2c91a47e -#define CPUFAMILY_ARM_TWISTER 0x92fb37c8 /* The following synonyms are deprecated: */ #define CPUFAMILY_INTEL_6_14 CPUFAMILY_INTEL_YONAH diff --git a/osfmk/mach/thread_info.h b/osfmk/mach/thread_info.h index c4794aab5..e39523ffc 100644 --- a/osfmk/mach/thread_info.h +++ b/osfmk/mach/thread_info.h @@ -182,16 +182,19 @@ typedef struct thread_debug_info_internal thread_debug_info_internal_data_t; #endif /* PRIVATE */ +#define IO_NUM_PRIORITIES 4 -/* - * Obsolete interfaces. - */ - -#define THREAD_SCHED_TIMESHARE_INFO 10 -#define THREAD_SCHED_RR_INFO 11 -#define THREAD_SCHED_FIFO_INFO 12 +#define UPDATE_IO_STATS(info, size) \ +{ \ + info.count++; \ + info.size += size; \ +} -#define IO_NUM_PRIORITIES 4 +#define UPDATE_IO_STATS_ATOMIC(info, io_size) \ +{ \ + OSIncrementAtomic64((SInt64 *)&(info.count)); \ + OSAddAtomic64(io_size, (SInt64 *)&(info.size)); \ +} struct io_stat_entry { uint64_t count; @@ -208,4 +211,12 @@ struct io_stat_info { typedef struct io_stat_info *io_stat_info_t; +/* + * Obsolete interfaces. + */ + +#define THREAD_SCHED_TIMESHARE_INFO 10 +#define THREAD_SCHED_RR_INFO 11 +#define THREAD_SCHED_FIFO_INFO 12 + #endif /* _MACH_THREAD_INFO_H_ */ diff --git a/osfmk/mach/vm_param.h b/osfmk/mach/vm_param.h index 6a33043a6..1fa361488 100644 --- a/osfmk/mach/vm_param.h +++ b/osfmk/mach/vm_param.h @@ -308,7 +308,9 @@ extern vm_offset_t vm_elinkedit; * VM_KERNEL_UNSLIDE_OR_ADDRPERM: * Use this macro when you are exposing an address to userspace that could * come from either kernel text/data *or* the heap. This is a rare case, - * but one that does come up and must be handled correctly. 
+ * but one that does come up and must be handled correctly. If the argument + * is known to be lower than any potential heap address, no transformation + * is applied, to avoid revealing the operation on a constant. * * Nesting of these macros should be considered invalid. */ @@ -333,7 +335,7 @@ extern vm_offset_t vm_elinkedit; VM_KERNEL_IS_PRELINKINFO(_v) || \ VM_KERNEL_IS_KEXT_LINKEDIT(_v)) ? \ (vm_offset_t)(_v) - vm_kernel_slide : \ - VM_KERNEL_ADDRPERM(_v)) + ((vm_offset_t)(_v) >= VM_MIN_KERNEL_AND_KEXT_ADDRESS ? VM_KERNEL_ADDRPERM(_v) : (vm_offset_t)(_v))) #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/mach_debug/mach_debug_types.defs b/osfmk/mach_debug/mach_debug_types.defs index 07e906a6f..51e62a584 100644 --- a/osfmk/mach_debug/mach_debug_types.defs +++ b/osfmk/mach_debug/mach_debug_types.defs @@ -102,7 +102,7 @@ type page_address_array_t = ^array[] of integer_t; type symtab_name_t = c_string[*:32]; -type lockgroup_info_t = struct[63] of integer_t; +type lockgroup_info_t = struct[33] of uint64_t; type lockgroup_info_array_t = array[] of lockgroup_info_t; type mach_memory_info_t = struct[8] of uint64_t; diff --git a/osfmk/vm/bsd_vm.c b/osfmk/vm/bsd_vm.c index f7e485bde..2dbe896b4 100644 --- a/osfmk/vm/bsd_vm.c +++ b/osfmk/vm/bsd_vm.c @@ -375,6 +375,7 @@ memory_object_control_uiomove( int i; int orig_offset; vm_page_t page_run[MAX_RUN]; + int dirty_count; /* keeps track of number of pages dirtied as part of this uiomove */ object = memory_object_control_to_vm_object(control); if (object == VM_OBJECT_NULL) { @@ -395,14 +396,15 @@ memory_object_control_uiomove( return 0; } orig_offset = start_offset; - + + dirty_count = 0; while (io_requested && retval == 0) { cur_needed = (start_offset + io_requested + (PAGE_SIZE - 1)) / PAGE_SIZE; if (cur_needed > MAX_RUN) cur_needed = MAX_RUN; - + for (cur_run = 0; cur_run < cur_needed; ) { if ((dst_page = vm_page_lookup(object, offset)) == VM_PAGE_NULL) @@ -435,6 +437,8 @@ memory_object_control_uiomove( 
assert(!dst_page->encrypted); if (mark_dirty) { + if (dst_page->dirty == FALSE) + dirty_count++; SET_PAGE_DIRTY(dst_page, FALSE); if (dst_page->cs_validated && !dst_page->cs_tainted) { @@ -518,7 +522,7 @@ memory_object_control_uiomove( orig_offset = 0; } vm_object_unlock(object); - + task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED); return (retval); } diff --git a/osfmk/vm/memory_object.c b/osfmk/vm/memory_object.c index e8ace8c9b..848b1eea8 100644 --- a/osfmk/vm/memory_object.c +++ b/osfmk/vm/memory_object.c @@ -537,10 +537,12 @@ vm_object_update_extent( struct vm_page_delayed_work *dwp; int dw_count; int dw_limit; + int dirty_count; dwp = &dw_array[0]; dw_count = 0; dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); + dirty_count = 0; for (; offset < offset_end && object->resident_page_count; @@ -595,6 +597,8 @@ vm_object_update_extent( break; case MEMORY_OBJECT_LOCK_RESULT_MUST_FREE: + if (m->dirty == TRUE) + dirty_count++; dwp->dw_mask |= DW_vm_page_free; break; @@ -646,6 +650,10 @@ vm_object_update_extent( break; } } + + if (dirty_count) { + task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_INVALIDATED); + } /* * We have completed the scan for applicable pages. * Clean any pages that have been saved. 
diff --git a/osfmk/vm/pmap.h b/osfmk/vm/pmap.h index 9d72bf588..8a4b26961 100644 --- a/osfmk/vm/pmap.h +++ b/osfmk/vm/pmap.h @@ -658,10 +658,10 @@ extern void pmap_unmap_sharedpage(pmap_t pmap); void pmap_pre_expand(pmap_t pmap, vm_map_offset_t vaddr); #endif -unsigned int pmap_query_resident(pmap_t pmap, - vm_map_offset_t s, - vm_map_offset_t e, - unsigned int *compressed_count_p); +mach_vm_size_t pmap_query_resident(pmap_t pmap, + vm_map_offset_t s, + vm_map_offset_t e, + mach_vm_size_t *compressed_bytes_p); #if CONFIG_PGTRACE int pmap_pgtrace_add_page(pmap_t pmap, vm_map_offset_t start, vm_map_offset_t end); diff --git a/osfmk/vm/vm_compressor.c b/osfmk/vm/vm_compressor.c index cf05d1950..64b7d7bd9 100644 --- a/osfmk/vm/vm_compressor.c +++ b/osfmk/vm/vm_compressor.c @@ -324,7 +324,10 @@ vm_wants_task_throttled(task_t task) if (task == kernel_task) return (0); - if (vm_compressor_mode == COMPRESSED_PAGER_IS_ACTIVE || vm_compressor_mode == DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { + if (COMPRESSED_PAGER_IS_SWAPLESS || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPLESS) + return (0); + + if (COMPRESSED_PAGER_IS_SWAPBACKED || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) { if ((vm_compressor_low_on_space() || HARD_THROTTLE_LIMIT_REACHED()) && (unsigned int)pmap_compressed(task->map->pmap) > (c_segment_pages_compressed / 4)) return (1); @@ -2152,6 +2155,9 @@ do_fastwake_warmup(void) c_seg->c_generation_id > last_c_segment_to_warm_generation_id) break; + if (vm_page_free_count < (AVAILABLE_MEMORY / 4)) + break; + lck_mtx_lock_spin_always(&c_seg->c_lock); lck_mtx_unlock_always(c_list_lock); diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index 663199f2a..655c302d2 100644 --- a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ -3192,7 +3192,8 @@ MACRO_END if (m->wpmapped == FALSE) { vm_object_lock_assert_exclusive(m->object); - + if (!m->object->internal) + task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED); m->wpmapped = TRUE; } if 
(must_disconnect) { @@ -4827,7 +4828,10 @@ vm_fault_internal( int superpage; if (!object->pager_created && - object->phys_contiguous) { + object->phys_contiguous && + VME_OFFSET(entry) == 0 && + (entry->vme_end - entry->vme_start == object->vo_size) && + VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size-1))) { superpage = VM_MEM_SUPERPAGE; } else { superpage = 0; diff --git a/osfmk/vm/vm_kern.c b/osfmk/vm/vm_kern.c index d015ebd2c..c75b23835 100644 --- a/osfmk/vm/vm_kern.c +++ b/osfmk/vm/vm_kern.c @@ -281,9 +281,10 @@ kernel_memory_allocate( * limit the size of a single extent of wired memory * to try and limit the damage to the system if * too many pages get wired down - * limit raised to 2GB with 128GB max physical limit + * limit raised to 2GB with 128GB max physical limit, + * but scaled by installed memory above this */ - if ( !(flags & KMA_VAONLY) && map_size > (1ULL << 31)) { + if ( !(flags & KMA_VAONLY) && map_size > MAX(1ULL<<31, sane_size/64)) { return KERN_RESOURCE_SHORTAGE; } diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index 7800e4a23..ac2edb6a5 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -4999,6 +4999,7 @@ vm_map_wire_nested( &real_map)) { vm_map_unlock_read(lookup_map); + assert(map_pmap == NULL); vm_map_unwire(map, start, s, user_wire); return(KERN_FAILURE); @@ -5347,7 +5348,8 @@ vm_map_wire_nested( if (rc != KERN_SUCCESS) { /* undo what has been wired so far */ - vm_map_unwire(map, start, s, user_wire); + vm_map_unwire_nested(map, start, s, user_wire, + map_pmap, pmap_addr); if (physpage_p) { *physpage_p = 0; } @@ -9152,13 +9154,35 @@ vm_map_copyin_common( __unused boolean_t src_volatile, vm_map_copy_t *copy_result, /* OUT */ boolean_t use_maxprot) +{ + int flags; + + flags = 0; + if (src_destroy) { + flags |= VM_MAP_COPYIN_SRC_DESTROY; + } + if (use_maxprot) { + flags |= VM_MAP_COPYIN_USE_MAXPROT; + } + return vm_map_copyin_internal(src_map, + src_addr, + len, + flags, + copy_result); +} +kern_return_t 
+vm_map_copyin_internal( + vm_map_t src_map, + vm_map_address_t src_addr, + vm_map_size_t len, + int flags, + vm_map_copy_t *copy_result) /* OUT */ { vm_map_entry_t tmp_entry; /* Result of last map lookup -- * in multi-level lookup, this * entry contains the actual * vm_object/offset. */ - register vm_map_entry_t new_entry = VM_MAP_ENTRY_NULL; /* Map entry for copy */ vm_map_offset_t src_start; /* Start of current entry -- @@ -9171,10 +9195,18 @@ vm_map_copyin_common( boolean_t map_share=FALSE; submap_map_t *parent_maps = NULL; - register vm_map_copy_t copy; /* Resulting copy */ vm_map_address_t copy_addr; vm_map_size_t copy_size; + boolean_t src_destroy; + boolean_t use_maxprot; + + if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) { + return KERN_INVALID_ARGUMENT; + } + + src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE; + use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE; /* * Check for copies of zero bytes. @@ -9198,7 +9230,9 @@ vm_map_copyin_common( * setting up VM (and taking C-O-W faults) dominates the copy costs * for small regions. 
*/ - if ((len < msg_ool_size_small) && !use_maxprot) + if ((len < msg_ool_size_small) && + !use_maxprot && + !(flags & VM_MAP_COPYIN_ENTRY_LIST)) return vm_map_copyin_kernel_buffer(src_map, src_addr, len, src_destroy, copy_result); @@ -15885,7 +15919,6 @@ vm_map_query_volatile( mach_vm_size_t volatile_pmap_count; mach_vm_size_t volatile_compressed_pmap_count; mach_vm_size_t resident_count; - unsigned int compressed_count; vm_map_entry_t entry; vm_object_t object; @@ -15900,6 +15933,8 @@ vm_map_query_volatile( for (entry = vm_map_first_entry(map); entry != vm_map_to_entry(map); entry = entry->vme_next) { + mach_vm_size_t pmap_resident_bytes, pmap_compressed_bytes; + if (entry->is_sub_map) { continue; } @@ -15937,12 +15972,15 @@ vm_map_query_volatile( volatile_compressed_count += vm_compressor_pager_get_count(object->pager); } - compressed_count = 0; - volatile_pmap_count += pmap_query_resident(map->pmap, - entry->vme_start, - entry->vme_end, - &compressed_count); - volatile_compressed_pmap_count += compressed_count; + pmap_compressed_bytes = 0; + pmap_resident_bytes = + pmap_query_resident(map->pmap, + entry->vme_start, + entry->vme_end, + &pmap_compressed_bytes); + volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE); + volatile_compressed_pmap_count += (pmap_compressed_bytes + / PAGE_SIZE); } /* map is still locked on return */ diff --git a/osfmk/vm/vm_map.h b/osfmk/vm/vm_map.h index 7f53e5325..44c987925 100644 --- a/osfmk/vm/vm_map.h +++ b/osfmk/vm/vm_map.h @@ -1159,6 +1159,17 @@ extern kern_return_t vm_map_copyin_common( vm_map_copy_t *copy_result, /* OUT */ boolean_t use_maxprot); +#define VM_MAP_COPYIN_SRC_DESTROY 0x00000001 +#define VM_MAP_COPYIN_USE_MAXPROT 0x00000002 +#define VM_MAP_COPYIN_ENTRY_LIST 0x00000004 +#define VM_MAP_COPYIN_ALL_FLAGS 0x00000007 +extern kern_return_t vm_map_copyin_internal( + vm_map_t src_map, + vm_map_address_t src_addr, + vm_map_size_t len, + int flags, + vm_map_copy_t *copy_result); /* OUT */ + extern kern_return_t 
vm_map_copy_extract( vm_map_t src_map, vm_map_address_t src_addr, diff --git a/osfmk/vm/vm_object.c b/osfmk/vm/vm_object.c index a2d77426b..35c9ba57b 100644 --- a/osfmk/vm/vm_object.c +++ b/osfmk/vm/vm_object.c @@ -6632,6 +6632,10 @@ vm_object_lock_request( void vm_object_purge(vm_object_t object, int flags) { + unsigned int object_page_count = 0; + unsigned int pgcount = 0; + boolean_t skipped_object = FALSE; + vm_object_lock_assert_exclusive(object); if (object->purgable == VM_PURGABLE_DENY) @@ -6677,11 +6681,12 @@ vm_object_purge(vm_object_t object, int flags) } assert(object->purgable == VM_PURGABLE_EMPTY); + object_page_count = object->resident_page_count; + vm_object_reap_pages(object, REAP_PURGEABLE); if (object->pager != NULL && COMPRESSED_PAGER_IS_ACTIVE) { - unsigned int pgcount; if (object->activity_in_progress == 0 && object->paging_in_progress == 0) { @@ -6726,10 +6731,19 @@ vm_object_purge(vm_object_t object, int flags) * pager if there's any kind of operation in * progress on the VM object. 
*/ + skipped_object = TRUE; } } vm_object_lock_assert_exclusive(object); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_VM, OBJECT_PURGE_ONE)), + VM_KERNEL_UNSLIDE_OR_PERM(object), /* purged object */ + object_page_count, + pgcount, + skipped_object, + 0); + } diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c index 53185086d..2fca44b36 100644 --- a/osfmk/vm/vm_pageout.c +++ b/osfmk/vm/vm_pageout.c @@ -322,6 +322,9 @@ static void vm_pageout_immediate(vm_page_t, boolean_t); boolean_t vm_compressor_immediate_preferred = FALSE; boolean_t vm_compressor_immediate_preferred_override = FALSE; boolean_t vm_restricted_to_single_processor = FALSE; +static boolean_t vm_pageout_waiter = FALSE; +static boolean_t vm_pageout_running = FALSE; + static thread_t vm_pageout_external_iothread = THREAD_NULL; static thread_t vm_pageout_internal_iothread = THREAD_NULL; @@ -349,7 +352,6 @@ int vm_upl_wait_for_pages = 0; */ unsigned int vm_pageout_active = 0; /* debugging */ -unsigned int vm_pageout_active_busy = 0; /* debugging */ unsigned int vm_pageout_inactive = 0; /* debugging */ unsigned int vm_pageout_inactive_throttled = 0; /* debugging */ unsigned int vm_pageout_inactive_forced = 0; /* debugging */ @@ -3126,6 +3128,10 @@ vm_pageout_continue(void) DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL); vm_pageout_scan_event_counter++; + lck_mtx_lock(&vm_page_queue_free_lock); + vm_pageout_running = TRUE; + lck_mtx_unlock(&vm_page_queue_free_lock); + vm_pageout_scan(); /* * we hold both the vm_page_queue_free_lock @@ -3135,6 +3141,12 @@ vm_pageout_continue(void) assert(vm_page_free_wanted_privileged == 0); assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT); + vm_pageout_running = FALSE; + if (vm_pageout_waiter) { + vm_pageout_waiter = FALSE; + thread_wakeup((event_t)&vm_pageout_waiter); + } + lck_mtx_unlock(&vm_page_queue_free_lock); vm_page_unlock_queues(); @@ -3143,6 +3155,25 @@ vm_pageout_continue(void) /*NOTREACHED*/ } +kern_return_t 
+vm_pageout_wait(uint64_t deadline) +{ + kern_return_t kr; + + lck_mtx_lock(&vm_page_queue_free_lock); + for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr); ) { + vm_pageout_waiter = TRUE; + if (THREAD_AWAKENED != lck_mtx_sleep_deadline( + &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT, + (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) { + kr = KERN_OPERATION_TIMED_OUT; + } + } + lck_mtx_unlock(&vm_page_queue_free_lock); + + return (kr); +} + #ifdef FAKE_DEADLOCK diff --git a/osfmk/vm/vm_pageout.h b/osfmk/vm/vm_pageout.h index 35ab0b343..bd7cb800a 100644 --- a/osfmk/vm/vm_pageout.h +++ b/osfmk/vm/vm_pageout.h @@ -131,6 +131,8 @@ extern int vm_debug_events; #define VM_EXECVE 0x131 #define VM_WAKEUP_COMPACTOR_SWAPPER 0x132 +#define VM_DATA_WRITE 0x140 + #define VM_DEBUG_EVENT(name, event, control, arg1, arg2, arg3, arg4) \ MACRO_BEGIN \ if (vm_debug_events) { \ @@ -186,6 +188,8 @@ extern vm_page_t vm_page_get_next(vm_page_t page); extern kern_return_t mach_vm_pressure_level_monitor(boolean_t wait_for_pressure, unsigned int *pressure_level); +extern kern_return_t vm_pageout_wait(uint64_t deadline); + #ifdef MACH_KERNEL_PRIVATE #include diff --git a/osfmk/vm/vm_purgeable.c b/osfmk/vm/vm_purgeable.c index 6df155404..3c6807cb5 100644 --- a/osfmk/vm/vm_purgeable.c +++ b/osfmk/vm/vm_purgeable.c @@ -71,14 +71,6 @@ int purgeable_nonvolatile_count; decl_lck_mtx_data(,vm_purgeable_queue_lock) -#define TOKEN_ADD 0x40 /* 0x100 */ -#define TOKEN_DELETE 0x41 /* 0x104 */ -#define TOKEN_RIPEN 0x42 /* 0x108 */ -#define OBJECT_ADD 0x48 /* 0x120 */ -#define OBJECT_REMOVE 0x49 /* 0x124 */ -#define OBJECT_PURGE 0x4a /* 0x128 */ -#define OBJECT_PURGE_ALL 0x4b /* 0x12c */ - static token_idx_t vm_purgeable_token_remove_first(purgeable_q_t queue); static void vm_purgeable_stats_helper(vm_purgeable_stat_t *stat, purgeable_q_t queue, int group, task_t target_task); @@ -688,6 +680,8 @@ vm_purgeable_object_find_and_lock( int best_object_task_importance; int 
best_object_skipped; int num_objects_skipped; + int try_lock_failed = 0; + int try_lock_succeeded = 0; task_t owner; best_object = VM_OBJECT_NULL; @@ -700,12 +694,29 @@ vm_purgeable_object_find_and_lock( * remaining elements in order. */ - num_objects_skipped = -1; + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_VM, OBJECT_PURGE_LOOP) | DBG_FUNC_START), + pick_ripe, + group, + VM_KERNEL_UNSLIDE_OR_PERM(queue), + 0, + 0); + + num_objects_skipped = 0; for (object = (vm_object_t) queue_first(&queue->objq[group]); !queue_end(&queue->objq[group], (queue_entry_t) object); object = (vm_object_t) queue_next(&object->objq), num_objects_skipped++) { + /* + * To prevent us looping for an excessively long time, choose + * the best object we've seen after looking at PURGEABLE_LOOP_MAX elements. + * If we haven't seen an eligible object after PURGEABLE_LOOP_MAX elements, + * we keep going until we find the first eligible object. + */ + if ((num_objects_skipped >= PURGEABLE_LOOP_MAX) && (best_object != NULL)) { + break; + } + if (pick_ripe && ! object->purgeable_when_ripe) { /* we want an object that has a ripe token */ @@ -721,6 +732,7 @@ vm_purgeable_object_find_and_lock( if (object_task_importance < best_object_task_importance) { if (vm_object_lock_try(object)) { + try_lock_succeeded++; if (best_object != VM_OBJECT_NULL) { /* forget about previous best object */ vm_object_unlock(best_object); @@ -732,9 +744,19 @@ vm_purgeable_object_find_and_lock( /* can't get any better: stop looking */ break; } + } else { + try_lock_failed++; } } } + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_VM, OBJECT_PURGE_LOOP) | DBG_FUNC_END), + num_objects_skipped, /* considered objects */ + try_lock_failed, + try_lock_succeeded, + VM_KERNEL_UNSLIDE_OR_PERM(best_object), + ((best_object == NULL) ? 
0 : best_object->resident_page_count)); + object = best_object; if (object == VM_OBJECT_NULL) { diff --git a/osfmk/vm/vm_purgeable_internal.h b/osfmk/vm/vm_purgeable_internal.h index c958f6b0e..c982a6307 100644 --- a/osfmk/vm/vm_purgeable_internal.h +++ b/osfmk/vm/vm_purgeable_internal.h @@ -128,4 +128,16 @@ void vm_purgeable_accounting(vm_object_t object, void vm_purgeable_compressed_update(vm_object_t object, int delta); +#define PURGEABLE_LOOP_MAX 64 + +#define TOKEN_ADD 0x40 /* 0x100 */ +#define TOKEN_DELETE 0x41 /* 0x104 */ +#define TOKEN_RIPEN 0x42 /* 0x108 */ +#define OBJECT_ADD 0x48 /* 0x120 */ +#define OBJECT_REMOVE 0x49 /* 0x124 */ +#define OBJECT_PURGE 0x4a /* 0x128 */ +#define OBJECT_PURGE_ALL 0x4b /* 0x12c */ +#define OBJECT_PURGE_ONE 0x4c /* 0x130 */ +#define OBJECT_PURGE_LOOP 0x4e /* 0x138 */ + #endif /* __VM_PURGEABLE_INTERNAL__ */ diff --git a/osfmk/vm/vm_resident.c b/osfmk/vm/vm_resident.c index 0f1c6c990..25c1edb26 100644 --- a/osfmk/vm/vm_resident.c +++ b/osfmk/vm/vm_resident.c @@ -3104,14 +3104,16 @@ vm_page_unwire( VM_PAGE_CHECK(mem); assert(VM_PAGE_WIRED(mem)); + assert(!mem->gobbled); assert(mem->object != VM_OBJECT_NULL); #if DEBUG vm_object_lock_assert_exclusive(mem->object); lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); #endif if (--mem->wire_count == 0) { - assert(!mem->private && !mem->fictitious); - vm_page_wire_count--; + if (!mem->private && !mem->fictitious) { + vm_page_wire_count--; + } assert(mem->object->wired_page_count > 0); mem->object->wired_page_count--; if (!mem->object->wired_page_count) { diff --git a/osfmk/vm/vm_user.c b/osfmk/vm/vm_user.c index 7b1eb0703..8ed0fc483 100644 --- a/osfmk/vm/vm_user.c +++ b/osfmk/vm/vm_user.c @@ -2114,11 +2114,11 @@ mach_make_memory_entry_64( offset_in_page = 0; } - kr = vm_map_copyin(target_map, - map_start, - map_size, - FALSE, - &copy); + kr = vm_map_copyin_internal(target_map, + map_start, + map_size, + VM_MAP_COPYIN_ENTRY_LIST, + &copy); if (kr != KERN_SUCCESS) { return kr; } diff 
--git a/osfmk/x86_64/idt64.s b/osfmk/x86_64/idt64.s index 0dc07f850..edba4c4db 100644 --- a/osfmk/x86_64/idt64.s +++ b/osfmk/x86_64/idt64.s @@ -307,9 +307,9 @@ L_common_dispatch: shr $32, %rcx testl %ecx, %ecx jz 4f - movl $0, %gs:CPU_TLB_INVALID testl $(1<<16), %ecx /* Global? */ jz 3f + movl $0, %gs:CPU_TLB_INVALID mov %cr4, %rcx /* RMWW CR4, for lack of an alternative*/ and $(~CR4_PGE), %rcx mov %rcx, %cr4 @@ -317,6 +317,7 @@ L_common_dispatch: mov %rcx, %cr4 jmp 4f 3: + movb $0, %gs:CPU_TLB_INVALID_LOCAL mov %cr3, %rcx mov %rcx, %cr3 4: diff --git a/osfmk/x86_64/pmap.c b/osfmk/x86_64/pmap.c index 69a3bdc26..45be58257 100644 --- a/osfmk/x86_64/pmap.c +++ b/osfmk/x86_64/pmap.c @@ -2434,6 +2434,7 @@ pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int o boolean_t pmap_is_shared = (pmap->pm_shared || (pmap == kernel_pmap)); boolean_t need_global_flush = FALSE; uint32_t event_code; + vm_map_offset_t event_startv, event_endv; boolean_t is_ept = is_ept_pmap(pmap); assert((processor_avail_count < 2) || @@ -2441,14 +2442,20 @@ pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int o if (pmap == kernel_pmap) { event_code = PMAP_CODE(PMAP__FLUSH_KERN_TLBS); + event_startv = VM_KERNEL_UNSLIDE_OR_PERM(startv); + event_endv = VM_KERNEL_UNSLIDE_OR_PERM(endv); } else if (is_ept) { event_code = PMAP_CODE(PMAP__FLUSH_EPT); + event_startv = startv; + event_endv = endv; } else { event_code = PMAP_CODE(PMAP__FLUSH_TLBS); + event_startv = startv; + event_endv = endv; } PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_START, - pmap, options, startv, endv, 0); + VM_KERNEL_UNSLIDE_OR_PERM(pmap), options, event_startv, event_endv, 0); if (is_ept) { mp_cpus_call(CPUMASK_ALL, ASYNC, invept, (void*)pmap->pm_eptp); @@ -2574,7 +2581,7 @@ pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int o continue; PMAP_TRACE_CONSTANT( PMAP_CODE(PMAP__FLUSH_TLBS_TO), - pmap, cpus_to_signal, cpus_to_respond, 0, 0); + 
VM_KERNEL_UNSLIDE_OR_PERM(pmap), cpus_to_signal, cpus_to_respond, 0, 0); is_timeout_traced = TRUE; continue; } @@ -2595,7 +2602,7 @@ pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int o out: PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_END, - pmap, cpus_to_signal, startv, endv, 0); + VM_KERNEL_UNSLIDE_OR_PERM(pmap), cpus_to_signal, event_startv, event_endv, 0); } diff --git a/security/mac_base.c b/security/mac_base.c index d75117bc8..7f147cff1 100644 --- a/security/mac_base.c +++ b/security/mac_base.c @@ -364,6 +364,7 @@ mac_policy_init(void) mac_policy_list.chunks = 1; mac_policy_list.entries = kalloc(sizeof(struct mac_policy_list_element) * MAC_POLICY_LIST_CHUNKSIZE); + bzero(mac_policy_list.entries, sizeof(struct mac_policy_list_element) * MAC_POLICY_LIST_CHUNKSIZE); LIST_INIT(&mac_label_element_list); diff --git a/tools/lldbmacros/ioreg.py b/tools/lldbmacros/ioreg.py index 7f079563c..26cb4b64a 100644 --- a/tools/lldbmacros/ioreg.py +++ b/tools/lldbmacros/ioreg.py @@ -109,9 +109,10 @@ def GetRegistryEntrySummary(entry): vtableAddr = dereference(Cast(entry, 'uintptr_t *')) - 2 * sizeof('uintptr_t *') vtype = kern.SymbolicateFromAddress(vtableAddr) if vtype is None or len(vtype) < 1: - out_string += " is fixed" - # FIXME: Uncomment me when is fixed - #if numbytes == 1: - # result = dereference(Cast(result_pkt.data, 'uint8_t *')) - #elif numbytes == 2: - # result = dereference(Cast(result_pkt.data, 'uint16_t *')) - #elif numbytes == 4: - # result = dereference(cast(result_pkt.data, 'uint32_t *')) - - print "0x{0: <4x}: 0x{1: <1x}".format(addr, result) + if numbytes == 1: + result = dereference(Cast(addressof(result_pkt.data), 'uint8_t *')) + elif numbytes == 2: + result = dereference(Cast(addressof(result_pkt.data), 'uint16_t *')) + elif numbytes == 4: + result = dereference(Cast(addressof(result_pkt.data), 'uint32_t *')) + + print "{0: <#6x}: {1:#0{2}x}".format(addr, result, (numbytes*2)+2) def WriteIOPortInt(addr, numbytes, value, lcpu): 
""" Writes 'value' into ioport specified by 'addr'. Prints errors if it encounters any @@ -742,12 +741,12 @@ def WriteIOPortInt(addr, numbytes, value, lcpu): len_address = unsigned(addressof(kern.globals.manual_pkt.len)) data_address = unsigned(addressof(kern.globals.manual_pkt.data)) if not WriteInt32ToMemoryAddress(0, input_address): - print "error writing 0x{0: x} to port 0x{1: <4x}".format(value, addr) + print "error writing {0: #x} to port {1: <#6x}: failed to write 0 to input_address".format(value, addr) return kdp_pkt_size = GetType('kdp_writeioport_req_t').GetByteSize() if not WriteInt32ToMemoryAddress(kdp_pkt_size, len_address): - print "error writing 0x{0: x} to port 0x{1: <4x}".format(value, addr) + print "error writing {0: #x} to port {1: <#6x}: failed to write kdp_pkt_size".format(value, addr) return kgm_pkt = kern.GetValueFromAddress(data_address, 'kdp_writeioport_req_t *') @@ -759,29 +758,29 @@ def WriteIOPortInt(addr, numbytes, value, lcpu): WriteInt32ToMemoryAddress(numbytes, int(addressof(kgm_pkt.nbytes))) and WriteInt16ToMemoryAddress(lcpu, int(addressof(kgm_pkt.lcpu))) ): - print "This macro is incomplete till is fixed" - # FIXME: Uncomment me when is fixed - #if numbytes == 1: - # if not WriteInt8ToMemoryAddress(value, int(addressof(kgm_pkt.data))): - # print "error writing 0x{0: x} to port 0x{1: <4x}".format(value, addr) - #elif numbytes == 2: - # if not WriteInt16ToMemoryAddress(value, int(addressof(kgm_pkt.data))): - # print "error writing 0x{0: x} to port 0x{1: <4x}".format(value, addr) - #elif numbytes == 4: - # if not WriteInt32ToMemoryAddress(value, int(addressof(kgm_pkt.data))): - # print "error writing 0x{0: x} to port 0x{1: <4x}".format(value, addr) - + if numbytes == 1: + if not WriteInt8ToMemoryAddress(value, int(addressof(kgm_pkt.data))): + print "error writing {0: #x} to port {1: <#6x}: failed to write 8 bit data".format(value, addr) + return + elif numbytes == 2: + if not WriteInt16ToMemoryAddress(value, 
int(addressof(kgm_pkt.data))): + print "error writing {0: #x} to port {1: <#6x}: failed to write 16 bit data".format(value, addr) + return + elif numbytes == 4: + if not WriteInt32ToMemoryAddress(value, int(addressof(kgm_pkt.data))): + print "error writing {0: #x} to port {1: <#6x}: failed to write 32 bit data".format(value, addr) + return if not WriteInt32ToMemoryAddress(1, input_address): - print "error writing 0x{0: x} to port 0x{1: <4x}".format(value, addr) + print "error writing {0: #x} to port {1: <#6x}: failed to write to input_address".format(value, addr) return result_pkt = Cast(addressof(kern.globals.manual_pkt.data), 'kdp_writeioport_reply_t *') # Done with the write if(result_pkt.error == 0): - print "Writing 0x {0: x} to port {1: <4x} was successful".format(value, addr) + print "Writing {0: #x} to port {1: <#6x} was successful".format(value, addr) else: - print "error writing 0x{0: x} to port 0x{1: <4x}".format(value, addr) + print "error writing {0: #x} to port {1: <#6x}".format(value, addr) @lldb_command('showinterruptcounts') def showinterruptcounts(cmd_args=None): diff --git a/tools/lldbmacros/kcdata.py b/tools/lldbmacros/kcdata.py index 259a4a7ed..b5216e3e6 100644 --- a/tools/lldbmacros/kcdata.py +++ b/tools/lldbmacros/kcdata.py @@ -761,6 +761,7 @@ def SaveStackshotReport(j, outfile_name, dsc_uuid, dsc_libs_arr): ] dsc_libs = [] + print "Shared cache UUID found from the binary data is <%s> " % str(dsc_common[0]) if dsc_common[0].replace('-', '').lower() == dsc_uuid: print "SUCCESS: Found Matching dyld shared cache uuid. Loading library load addresses from layout provided." _load_addr = dsc_common[1] @@ -833,6 +834,9 @@ def SaveStackshotReport(j, outfile_name, dsc_uuid, dsc_libs_arr): for tid,thdata in thlist.iteritems(): threadByID[str(tid)] = {} thsnap = threadByID[str(tid)] + if "thread_snapshot_v2" not in thdata: + print "Found broken thread state for thread ID: %s." 
% tid + break threadsnap = thdata["thread_snapshot_v2"] thsnap["userTime"] = GetSecondsFromMATime(threadsnap["user_time"], timebase) thsnap["id"] = threadsnap["thread_id"] diff --git a/tools/lldbmacros/memory.py b/tools/lldbmacros/memory.py index b3d4dccc9..16604d864 100644 --- a/tools/lldbmacros/memory.py +++ b/tools/lldbmacros/memory.py @@ -1074,7 +1074,9 @@ def GetVMMapSummary(vmmap): vm_size = uint64_t(vmmap.size).value resident_pages = 0 if vmmap.pmap != 0: resident_pages = int(vmmap.pmap.stats.resident_count) - out_string += format_string.format(vmmap, vmmap.pmap, vm_size, vmmap.hdr.nentries, resident_pages, vmmap.hint, vmmap.first_free) + first_free = 0 + if int(vmmap.holelistenabled) == 0: first_free = vmmap.f_s.first_free + out_string += format_string.format(vmmap, vmmap.pmap, vm_size, vmmap.hdr.nentries, resident_pages, vmmap.hint, first_free) return out_string @lldb_type_summary(['vm_map_entry']) diff --git a/tools/lldbmacros/process.py b/tools/lldbmacros/process.py index b4c85f918..e2ddb8e3d 100644 --- a/tools/lldbmacros/process.py +++ b/tools/lldbmacros/process.py @@ -1026,6 +1026,21 @@ def DumpCallQueue(cmd_args=None): #EndMacro: dumpcallqueue +@lldb_command('showalltasklogicalwrites') +def ShowAllTaskIOStats(cmd_args=None): + """ Command to print I/O stats for all tasks + """ + print "{0: <20s} {1: <20s} {2: <20s} {3: <20s} {4: <20s} {5: <20s}".format("task", "Immediate Writes", "Deferred Writes", "Invalidated Writes", "Metadata Writes", "name") + for t in kern.tasks: + pval = Cast(t.bsd_info, 'proc *') + print "{0: <#18x} {1: >20d} {2: >20d} {3: >20d} {4: >20d} {5: <20s}".format(t, + t.task_immediate_writes, + t.task_deferred_writes, + t.task_invalidated_writes, + t.task_metadata_writes, + str(pval.p_comm)) + + @lldb_command('showalltasks','C') def ShowAllTasks(cmd_args=None, cmd_options={}): """ Routine to print a summary listing of all the tasks @@ -1236,7 +1251,7 @@ def SwitchToRegs(cmd_args=None): fake_thread_id = 0xdead0000 | (saved_state & 
~0xffff0000) fake_thread_id = fake_thread_id & 0xdeadffff lldb_process.CreateOSPluginThread(0xdeadbeef, saved_state) - lldbthread = lldb_process.GetThreadByID(fake_thread_id) + lldbthread = lldb_process.GetThreadByID(int(fake_thread_id)) if not lldbthread.IsValid(): print "Failed to create thread" diff --git a/tools/tests/MPMMTest/KQMPMMtest.c b/tools/tests/MPMMTest/KQMPMMtest.c index 635726b6d..0686350f3 100644 --- a/tools/tests/MPMMTest/KQMPMMtest.c +++ b/tools/tests/MPMMTest/KQMPMMtest.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include "../unit_tests/tests_common.h" #define MAX(A, B) ((A) < (B) ? (B) : (A)) @@ -68,6 +70,8 @@ static boolean_t timeshare = FALSE; static boolean_t threaded = FALSE; static boolean_t oneway = FALSE; static boolean_t do_select = FALSE; +static boolean_t save_perfdata = FALSE; + int msg_type; int num_ints; int num_msgs; @@ -98,6 +102,7 @@ void usage(const char *progname) { fprintf(stderr, " -work num\t\tmicroseconds of client work\n"); fprintf(stderr, " -pages num\t\tpages of memory touched by client work\n"); fprintf(stderr, " -select \t\tselect prior to calling kevent().\n"); + fprintf(stderr, " -perf \t\tCreate perfdata files for metrics.\n"); fprintf(stderr, "default values are:\n"); fprintf(stderr, " . no affinity\n"); fprintf(stderr, " . 
not timeshare\n"); @@ -200,6 +205,9 @@ void parse_args(int argc, char *argv[]) { } else if (0 == strcmp("-select", argv[0])) { do_select = TRUE; argc--; argv++; + } else if (0 == strcmp("-perf", argv[0])) { + save_perfdata = TRUE; + argc--; argv++; } else usage(progname); } @@ -733,6 +741,7 @@ wait_for_servers(void) exit(1); } + int main(int argc, char *argv[]) { int i; @@ -820,6 +829,10 @@ int main(int argc, char *argv[]) double dsecs = (double) deltatv.tv_sec + 1.0E-6 * (double) deltatv.tv_usec; + double time_in_sec = (double)deltatv.tv_sec + (double)deltatv.tv_usec/1.0E6; /* tv_usec is in microseconds: divide by 1e6 (not 1000) to get seconds, matching dsecs */ + double throughput_msg_p_sec = (double) totalmsg/dsecs; + double avg_msg_latency = dsecs*1.0E6 / (double)totalmsg; + printf(" in %ld.%03u seconds\n", (long)deltatv.tv_sec, deltatv.tv_usec/1000); printf(" throughput in messages/sec: %g\n", @@ -827,6 +840,9 @@ int main(int argc, char *argv[]) printf(" average message latency (usec): %2.3g\n", dsecs * 1.0E6 / (double) totalmsg); + if (save_perfdata == TRUE) { + record_perf_data("kqmpmm_avg_msg_latency", "usec", avg_msg_latency, "Message latency measured in microseconds. 
Lower is better", stderr); + } return (0); } diff --git a/tools/tests/MPMMTest/MPMMtest.c b/tools/tests/MPMMTest/MPMMtest.c index c2991c37c..17b0a1acb 100644 --- a/tools/tests/MPMMTest/MPMMtest.c +++ b/tools/tests/MPMMTest/MPMMtest.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include "../unit_tests/tests_common.h" /* for record_perf_data() */ #include @@ -72,6 +74,7 @@ static boolean_t timeshare = FALSE; static boolean_t threaded = FALSE; static boolean_t oneway = FALSE; static boolean_t useset = FALSE; +static boolean_t save_perfdata = FALSE; int msg_type; int num_ints; int num_msgs; @@ -114,6 +117,7 @@ void usage(const char *progname) { fprintf(stderr, " -verbose\t\tbe verbose (use multiple times to increase verbosity)\n"); fprintf(stderr, " -oneway\t\tdo not request return reply\n"); fprintf(stderr, " -count num\t\tnumber of messages to send\n"); + fprintf(stderr, " -perf \t\tCreate perfdata files for metrics.\n"); fprintf(stderr, " -type trivial|inline|complex\ttype of messages to send\n"); fprintf(stderr, " -numints num\tnumber of 32-bit ints to send in messages\n"); fprintf(stderr, " -servers num\tnumber of server threads to run\n"); @@ -179,6 +183,9 @@ void parse_args(int argc, char *argv[]) { } else if (0 == strcmp("-oneway", argv[0])) { oneway = TRUE; argc--; argv++; + } else if (0 == strcmp("-perf", argv[0])) { + save_perfdata = TRUE; + argc--; argv++; } else if (0 == strcmp("-type", argv[0])) { if (argc < 2) usage(progname); @@ -940,6 +947,14 @@ int main(int argc, char *argv[]) printf(" average message latency (usec): %2.3g\n", dsecs * 1.0E6 / (double) totalmsg); + double time_in_sec = (double)deltatv.tv_sec + (double)deltatv.tv_usec/1.0E6; /* tv_usec is in microseconds: divide by 1e6 (not 1000) to get seconds, matching dsecs */ + double throughput_msg_p_sec = (double) totalmsg/dsecs; + double avg_msg_latency = dsecs*1.0E6 / (double)totalmsg; + + if (save_perfdata == TRUE) { + record_perf_data("mpmm_avg_msg_latency", "usec", avg_msg_latency, "Message latency measured in microseconds. 
Lower is better", stderr); + } + if (stress_prepost) { int64_t sendns = abs_to_ns(g_client_send_time); dsecs = (double)sendns / (double)NSEC_PER_SEC; diff --git a/tools/tests/TLBcoherency/Makefile b/tools/tests/TLBcoherency/Makefile new file mode 100644 index 000000000..00bbf15d6 --- /dev/null +++ b/tools/tests/TLBcoherency/Makefile @@ -0,0 +1,27 @@ +include ../Makefile.common + +CC:=$(shell xcrun -sdk "$(SDKROOT)" -find cc) + +ifdef RC_ARCHS + ARCHS:=$(RC_ARCHS) + else + ifeq "$(Embedded)" "YES" + ARCHS:=armv7 armv7s arm64 armv7k + else + ARCHS:=x86_64 + endif +endif + +CFLAGS := -g $(patsubst %, -arch %, $(ARCHS)) -isysroot $(SDKROOT) -isystem $(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders + +DSTROOT?=$(shell /bin/pwd) +SYMROOT?=$(shell /bin/pwd) + +DEBUG:=0 + +$(DSTROOT)/tlbcoh: TLBcoherency.c + $(CC) $(CFLAGS) -Wall TLBcoherency.c -o $(SYMROOT)/$(notdir $@) -DDEBUG=$(DEBUG) -g -Os + if [ ! -e $@ ]; then ditto $(SYMROOT)/$(notdir $@) $@; fi + +clean: + rm -rf $(DSTROOT)/tlbcoh $(SYMROOT)/*.dSYM $(SYMROOT)/tlbcoh diff --git a/tools/tests/TLBcoherency/TLBcoherency.c b/tools/tests/TLBcoherency/TLBcoherency.c new file mode 100644 index 000000000..a4165baac --- /dev/null +++ b/tools/tests/TLBcoherency/TLBcoherency.c @@ -0,0 +1,310 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */
+
+/* A pool of threads which attempt to verify multiprocessor TLB coherency.
+ * Creates -t threads, by default 4 (at most MAX_THREADS)
+ * Creates -s separate mmap(MAP_ANON) R/W mappings, sized at 1 page each but
+ * alterable via -z
+ * Initially read-faults each mapping in, verifying first-word zerofill--
+ * The kernel typically uses the physical aperture to perform the zerofill
+ * Writes map_address (page_aligned) | low 12 bits of the PID at the first word
+ * of every page of the mapping
+ * This can help verify ASID related inconsistencies
+ * Records a timestamp in a structure associated with each mapping
+ * With a custom kernel, it has the option (-y) of creating a remapping of the
+ * page in the kernel's address space to exercise shared kernel mapping
+ * coherency.
+ * Each thread subsequently loops around on the set of mappings. One thread
+ * (the first one created) is designated the observer thread. The thread
+ * acquires a lock on the arena element and verifies that the mapping has the
+ * expected pattern (Address | PID), if the element is not in the UNMAPPED
+ * state. Can optionally tell the kernel to check its alias as well. If it
+ * notices a mismatch, it emits a trace point and reports the mismatch on
+ * stderr. If the -f option is supplied, kernel tracing is stopped and the
+ * test is terminated.
+ * If the page has lingered beyond the -l threshold, non-observer threads will
+ * unmap the page, optionally calling into the kernel to unmap its alias, and
+ * repopulate the element on a later pass.
+ * NOTE: the -l value is compared directly against mach_absolute_time() deltas,
+ * so it is expressed in mach_absolute_time() units, not microseconds.
+ * After this sequence, the thread will optionally usleep for -p microseconds,
+ * to allow for idle power management to engage if possible (errata might exist
+ * in those areas), or context switches to occur.
+ * Created Derek Kumar, 2011.
+ */
+
+/*
+ * NOTE(review): the header names below were lost when this patch was
+ * extracted (only bare "#include" tokens survived). They are reconstructed
+ * from the identifiers this file uses -- confirm against the original.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <sys/sysctl.h>
+#include <mach/vm_param.h>
+#include <mach/mach_time.h>
+#include <libkern/OSAtomic.h>
+
+/* One arena element: a single anonymous mapping plus its bookkeeping. */
+typedef struct {
+	OSSpinLock tlock;	/* serializes all access to this element */
+	uintptr_t taddr;	/* base address returned by mmap() */
+	unsigned tstate;	/* one of enum arenastates */
+	uint64_t tctime;	/* mach_absolute_time() when the element was populated */
+} cpage;
+
+cpage *parray;
+
+#define ARENASIZE (1024)
+#define NTHREADS (4)
+#define PAGE_LINGER_TIME (2000000)
+#define MAX_THREADS (512)
+#define MYSYS (215)
+/* Wrapped in do/while so the macro behaves as a single statement. */
+#define CONSISTENCY(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+
+unsigned arenasize = ARENASIZE, mapping_size;
+uint64_t page_linger_time = PAGE_LINGER_TIME;
+enum arenastates {MTOUCHED = 1, UNMAPPED = 2, MAPPED = 4, WP =8};
+enum syscaction {MDOMAP = 1, MDOUNMAP = 2, MDOCHECK = 4};
+enum ttypes {OBSERVER = 1, LOOPER = 2};
+bool trymode = true;
+bool all_stop = false;
+bool stop_on_failure = false;
+bool reuse_addrs = true;
+bool dosyscall = false;
+
+pid_t cpid;
+int sleepus;
+
+pthread_t threads[MAX_THREADS];
+uint32_t roles[MAX_THREADS];
+
+/*
+ * Print a summary of the command line options on stderr and exit with
+ * status 1. `a` is the program's argv; only a[0] is used.
+ */
+void usage(char **a) {
+	const char *prog = (a != NULL && a[0] != NULL) ? a[0] : "<program>";
+
+	fprintf(stderr,
+	    "usage: %s [-l linger] [-t nthreads] [-s arenasize] [-p sleepus] [-z pages] [-f] [-r] [-y] [-h]\n"
+	    "  -l linger     age, in mach_absolute_time() units, after which a mapping is recycled (default %d)\n"
+	    "  -t nthreads   number of threads, 1..%d (default %d)\n"
+	    "  -s arenasize  number of mappings (default %d)\n"
+	    "  -p sleepus    microseconds to usleep() after visiting each mapping (default 0)\n"
+	    "  -z pages      pages per mapping (default 1)\n"
+	    "  -f            stop on the first mismatch\n"
+	    "  -r            do not pass a fixed address hint to mmap()\n"
+	    "  -y            issue syscall %d alongside map/check/unmap\n"
+	    "  -h            print this message\n",
+	    prog, PAGE_LINGER_TIME, MAX_THREADS, NTHREADS, ARENASIZE, MYSYS);
+	exit(1);
+}
+
+/*
+ * Issue the kern.kdebug KERN_KDENABLE sysctl with `val` as the final MIB
+ * component (callers in this file pass 0 after a mismatch). A failure is
+ * reported on stdout and otherwise ignored.
+ */
+void set_enable(int val)
+{
+	int mib[4];
+	size_t needed = 0;	/* was passed to sysctl() uninitialized */
+
+	mib[0] = CTL_KERN;
+	mib[1] = KERN_KDEBUG;
+	mib[2] = KERN_KDENABLE;
+	mib[3] = val;
+
+	if (sysctl(mib, 4, NULL, &needed, NULL, 0) < 0) {
+		printf("trace facility failure, KERN_KDENABLE\n");
+	}
+}
+
+/*
+ * (Re)populate arena element `i`: create a fresh shared anonymous mapping of
+ * mapping_size bytes, check that its first word reads back as zero, stamp the
+ * first word of every page with (address | low 12 bits of the PID), record
+ * the creation time and mark the element MTOUCHED.
+ *
+ * Exits with status 2 if mmap() fails, and with status 5 if the zero-fill
+ * check fails while stop_on_failure is set. tlbexerciser() calls this with
+ * the element's lock held.
+ */
+void initialize_arena_element(int i) {
+	__unused int sysret;
+	void *hint = reuse_addrs ? (void *)0x1000 : NULL;
+	void *mapping = mmap(hint, mapping_size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0);
+
+	if (mapping == MAP_FAILED) {
+		perror("mmap");
+		exit(2);
+	}
+	parray[i].taddr = (uintptr_t)mapping;
+
+	/*
+	 * uintptr_t is the native word on both ILP32 and LP64, so one
+	 * definition replaces the former per-ABI uint32_t/uint64_t pair.
+	 * Filling the whole page (memset_pattern4/memset_pattern8) is possible,
+	 * but a sufficiently unique first word gets the job done without
+	 * slowing down the test.
+	 */
+	uintptr_t pattern = parray[i].taddr | (cpid & 0xFFF);
+
+	/* First touch: read-faults the page in; it must be zero-filled. */
+	uint64_t val = (*(uintptr_t *)parray[i].taddr);
+
+	if (val != 0) {
+		CONSISTENCY("Mismatch, actual: 0x%llx, expected: 0x%llx\n", (unsigned long long)val, 0ULL);
+		if (stop_on_failure) {
+			set_enable(0);
+			exit(5);
+		}
+	}
+
+	/* Unsigned index and uintptr_t offset: no signed compare or int overflow. */
+	for (unsigned k = 0; k < (mapping_size >> PAGE_SHIFT); k++) {
+		*(uintptr_t *)(parray[i].taddr + (uintptr_t)k * PAGE_SIZE) = pattern;
+	}
+
+	parray[i].tctime = mach_absolute_time();
+	parray[i].tstate = MTOUCHED;
+
+	if (dosyscall) {
+		sysret = syscall(MYSYS, MDOMAP, parray[i].taddr, pattern, i, mapping_size);
+	}
+}
+
+/* Populate every element of the arena once, before any thread is started. */
+void initialize_arena(void) {
+	for (unsigned i = 0; i < arenasize; i++) {
+		initialize_arena_element((int)i);
+	}
+}
+
+/*
+ * Thread body. `targs` points at this thread's role (OBSERVER or LOOPER).
+ * Loops over the arena until all_stop is set; for each element it can lock:
+ *  - if the element is not UNMAPPED, verify the first word still holds
+ *    (address | PID) and, for non-observers, unmap it once it is older than
+ *    page_linger_time;
+ *  - if the element is UNMAPPED, non-observers repopulate it.
+ * Always returns NULL.
+ */
+void *tlbexerciser(void *targs) {
+	uint32_t role = *(uint32_t *)targs;
+	__unused int sysret;
+	printf("Starting thread %p, role: %u\n", (void *)pthread_self(), role);
+
+	for(;;) {
+		for (unsigned i = 0; i < arenasize; i++) {
+			if (all_stop)
+				return NULL;
+
+			if (trymode) {
+				/* Contended element: skip it rather than spin. */
+				if (OSSpinLockTry(&parray[i].tlock) == false)
+					continue;
+			} else {
+				OSSpinLockLock(&parray[i].tlock);
+			}
+
+			if (parray[i].tstate != UNMAPPED) {
+				uintptr_t ad;
+				ad = parray[i].taddr | (cpid & 0xFFF);
+				uintptr_t val = *(uintptr_t *)parray[i].taddr;
+
+				if (val != ad) {
+					if (stop_on_failure)
+						all_stop = true;
+					/*
+					 * Emit the expected value as high/low 32-bit
+					 * halves. Widen first: shifting a 32-bit
+					 * uintptr_t by 32 is undefined, and the old
+					 * "ad & ~0" mask was a no-op.
+					 * NOTE(review): 180 looks like xnu's
+					 * kdebug_trace syscall number -- confirm.
+					 */
+					syscall(180, 0x71BC0000,
+					    (uintptr_t)((uint64_t)ad >> 32),
+					    (uintptr_t)((uint64_t)ad & 0xFFFFFFFFULL),
+					    0, 0, 0);
+					CONSISTENCY("Mismatch, actual: 0x%llx, expected: 0x%llx\n", (unsigned long long)val, (unsigned long long)ad);
+					if (stop_on_failure) {
+						set_enable(0);
+						exit(5);
+					}
+				}
+
+				if (dosyscall) {
+					sysret = syscall(MYSYS, MDOCHECK, parray[i].taddr, ad, i, 0);
+				}
+
+				if ((role != OBSERVER) && ((mach_absolute_time() - parray[i].tctime) > page_linger_time)) {
+					parray[i].tstate = UNMAPPED;
+					if (munmap((void *)parray[i].taddr, mapping_size) != 0) {
+						perror("munmap");
+					}
+
+					if (dosyscall) {
+						sysret = syscall(MYSYS, MDOUNMAP, parray[i].taddr, ad, i, mapping_size);
+					}
+				}
+			} else {
+				if (role != OBSERVER) {
+					initialize_arena_element((int)i);
+				}
+			}
+
+			/*
+			 * Release through the lock API instead of a plain store
+			 * so the element's updates are ordered before the
+			 * release.
+			 */
+			OSSpinLockUnlock(&parray[i].tlock);
+
+			if (sleepus)
+				usleep(sleepus);
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Parse options, build the arena, start nthreads workers (the first is the
+ * observer, the rest are loopers) and wait for them.
+ * Returns 0 on success, or the pthread error code if a thread cannot be
+ * created or joined; exits with 1 on a usage error and 2 if the arena cannot
+ * be allocated.
+ */
+int main(int argc, char **argv) {
+	int arg;
+	unsigned nthreads = NTHREADS;
+
+	mapping_size = PAGE_SIZE;
+
+	/* 'h' takes no argument; the former "h:" made a bare -h an error. */
+	while ((arg = getopt(argc, argv, "l:t:hs:p:z:fry")) != -1) {
+		switch (arg) {
+		case 'l':
+			page_linger_time = strtoull(optarg, NULL, 0);
+			break;
+		case 't':
+			nthreads = atoi(optarg);
+			break;
+		case 's':
+			arenasize = atoi(optarg); // we typically want this to
+						  // be sized < 2nd level TLB
+			break;
+		case 'f':
+			stop_on_failure = true;
+			break;
+		case 'r':
+			reuse_addrs = false;
+			break;
+		case 'p':
+			sleepus = atoi(optarg);
+			break;
+		case 'y':
+			dosyscall = true;
+			break;
+		case 'z':
+			mapping_size = atoi(optarg) * PAGE_SIZE;
+			break;
+		case 'h':
+		default:
+			/* Unknown options used to be silently ignored. */
+			usage(argv);
+		}
+	}
+
+	if(optind != argc) {
+		usage(argv);
+	}
+
+	/* threads[] and roles[] hold MAX_THREADS entries; never index past them. */
+	if (nthreads == 0 || nthreads > MAX_THREADS) {
+		fprintf(stderr, "nthreads must be between 1 and %d\n", MAX_THREADS);
+		usage(argv);
+	}
+
+	/* With an empty arena the workers would spin without ever seeing all_stop. */
+	if (arenasize == 0) {
+		fprintf(stderr, "arenasize must be at least 1\n");
+		usage(argv);
+	}
+
+	printf("page_linger_time: 0x%llx, nthreads: %u, arenasize: %u sleepus: %d reuse_addrs: %u, stop_on_failure: %u, dosyscall: %u, mappingsize: 0x%x\n", (unsigned long long)page_linger_time, nthreads, arenasize, sleepus, (unsigned)reuse_addrs, (unsigned) stop_on_failure, (unsigned)dosyscall, mapping_size);
+
+	parray = calloc(arenasize, sizeof(cpage));
+	if (parray == NULL) {
+		perror("calloc");
+		exit(2);
+	}
+	cpid = getpid();
+
+	initialize_arena();
+
+	for (unsigned dex = 0; dex < nthreads; dex++) {
+		roles[dex] = LOOPER;
+		if (dex == 0)
+			roles[dex] = OBSERVER;
+		int result = pthread_create(&threads[dex], NULL, tlbexerciser, &roles[dex]);
+		if(result) {
+			printf("pthread_create: %d starting worker thread; aborting.\n", result);
+			return result;
+		}
+	}
+
+	for(unsigned dex = 0; dex < nthreads; dex++) {
+		void *rtn;
+		int result = pthread_join(threads[dex], &rtn);
+
+		if(result) {
+			printf("pthread_join(): %d, aborting\n", result);
+			return result;
+		}
+
+		if(rtn) {
+			printf("***Aborting on worker error\n");
+			exit(1);
+		}
+	}
+	return 0;
+}
diff --git a/tools/tests/execperf/test.sh b/tools/tests/execperf/test.sh index 7a8f31650..c9a940bd2 100755 --- a/tools/tests/execperf/test.sh +++ b/tools/tests/execperf/test.sh @@ -9,6 +9,20 @@ RUN=run PRODUCT=`sw_vers -productName` COUNT= +# params are: record_perf_data(metric, unit, value, description) +function record_perf_data() { + local METRIC=$1 + local UNIT=$2 + local DATA=$3 + local DESCRIPTION=$4 + echo "{ \"version\" : \"1.0\", \"measurements\" : {\"$METRIC\": {\"description\" : \"$DESCRIPTION\", \"names\":[\"$METRIC\"], \"units\" : [\"$UNIT\"], \"data\" : [$DATA] }}}" +} + +PERFDATA_DIR=$BATS_TMP_DIR +if [ "${PERFDATA_DIR}" == "" ]; then + PERFDATA_DIR=/tmp/ +fi + case "$PRODUCT" in "iPhone OS") COUNT=1000 @@ -22,7 +36,13 @@ for i in ${EXECUTABLES}; do echo "Running $i" for j in `jot $(sysctl -n hw.ncpu) 1`; do printf "\t%dx\t" $j - /usr/bin/time ./${RUN} $j $((${COUNT}/$j)) ./$i + METRIC_NAME="${i}_${j}x" + TIMEOUT=` /usr/bin/time ./${RUN} $j $((${COUNT}/$j)) ./$i 2>&1` + echo ${TIMEOUT} + REALTIME=`echo ${TIMEOUT} | awk '{ print $1 }'` + TOTALTIME=`echo ${TIMEOUT} | awk '{ print $3 + $5 }'` + record_perf_data "${METRIC_NAME}_real" "s" $REALTIME "Real time in seconds. Lower is better. This may have variance based on load on system" > ${PERFDATA_DIR}/${METRIC_NAME}_real.perfdata + record_perf_data "${METRIC_NAME}_sys" "s" $TOTALTIME "User + Sys time in seconds. Lower is better." > /tmp/${METRIC_NAME}_sys.perfdata if [ $? -ne 0 ]; then echo "Failed $i, exit status $?" exit 1