diff --git a/EXTERNAL_HEADERS/mach-o/loader.h b/EXTERNAL_HEADERS/mach-o/loader.h index b8277bc62..fd49201b7 100644 --- a/EXTERNAL_HEADERS/mach-o/loader.h +++ b/EXTERNAL_HEADERS/mach-o/loader.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * Copyright (c) 1999-2008 Apple Inc. All Rights Reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -192,6 +192,10 @@ struct mach_header_64 { the static linker does not need to examine dependent dylibs to see if any are re-exported */ +#define MH_PIE 0x200000 /* When this bit is set, the OS will + load the main executable at a + random address. Only used in + MH_EXECUTE filetypes. */ /* * The load commands directly follow the mach_header. The total size of all @@ -266,6 +270,8 @@ struct load_command { #define LC_CODE_SIGNATURE 0x1d /* local of code signature */ #define LC_SEGMENT_SPLIT_INFO 0x1e /* local of info to split segments */ #define LC_REEXPORT_DYLIB (0x1f | LC_REQ_DYLD) /* load and re-export dylib */ +#define LC_LAZY_LOAD_DYLIB 0x20 /* delay load of dylib until first use */ +#define LC_ENCRYPTION_INFO 0x21 /* encrypted segment information */ /* * A variable length string in a load command is represented by an lc_str @@ -448,7 +454,13 @@ struct section_64 { /* for 64-bit architectures */ #define S_INTERPOSING 0xd /* section with only pairs of function pointers for interposing */ -#define S_16BYTE_LITERALS 0xe /* section with only 16 byte literals */ +#define S_16BYTE_LITERALS 0xe /* section with only 16 byte + literals */ +#define S_DTRACE_DOF 0xf /* section contains + DTrace Object Format */ +#define S_LAZY_DYLIB_SYMBOL_POINTERS 0x10 /* section with only lazy + symbol pointers to lazy + loaded dylibs */ /* * Constants for the section attributes part of the flags field of a section * structure. 
@@ -1107,6 +1119,19 @@ struct linkedit_data_command { uint32_t datasize; /* file size of data in __LINKEDIT segment */ }; +/* + * The encryption_info_command contains the file offset and size of an + * encrypted segment. + */ +struct encryption_info_command { + uint32_t cmd; /* LC_ENCRYPTION_INFO */ + uint32_t cmdsize; /* sizeof(struct encryption_info_command) */ + uint32_t cryptoff; /* file offset of encrypted range */ + uint32_t cryptsize; /* file size of encrypted range */ + uint32_t cryptid; /* which encryption system, + 0 means not-encrypted yet */ +}; + /* * The symseg_command contains the offset and size of the GNU style * symbol table information as described in the header file . diff --git a/README b/README index 76ea08c38..10f6d9084 100644 --- a/README +++ b/README @@ -15,25 +15,25 @@ A. How to build XNU: By default, architecture defaults to the build machine architecture, and the kernel configuration is set to build for DEVELOPMENT. - The machine configuration defaults to S5L8900XRB for arm and default for i386 and ppc. + The machine configuration defaults to S5L8900X for arm and default for i386 and ppc. This will also create a bootable image, mach_kernel, and a kernel binary with symbols, mach_kernel.sys. 
Examples: /* make a debug kernel for H1 arm board */ - make TARGET_CONFIGS="debug arm s5l8900xrb" + make TARGET_CONFIGS="debug arm s5l8900x" - $(OBJROOT)/DEBUG_ARM_S5L8900XRB/osfmk/DEBUG/osfmk.o: pre-linked object for osfmk component - $(OBJROOT)/DEBUG_ARM_S5L8900XRB/mach_kernel: bootable image + $(OBJROOT)/DEBUG_ARM_S5L8900X/osfmk/DEBUG/osfmk.o: pre-linked object for osfmk component + $(OBJROOT)/DEBUG_ARM_S5L8900X/mach_kernel: bootable image /* make debug and development kernels for H1 arm board */ - make TARGET_CONFIGS="debug arm s5l8900xrb development arm s5l8900xrb" + make TARGET_CONFIGS="debug arm s5l8900x development arm s5l8900x" - $(OBJROOT)/DEBUG_ARM_S5L8900XRB/osfmk/DEBUG/osfmk.o: pre-linked object for osfmk component - $(OBJROOT)/DEBUG_ARM_S5L8900XRB/mach_kernel: bootable image - $(OBJROOT)/DEVELOPMENT_ARM/osfmk/DEVELOPMENT/osfmk.o: pre-linked object for osfmk component - $(OBJROOT)/DEVELOPMENT_ARM/mach_kernel: bootable image + $(OBJROOT)/DEBUG_ARM_S5L8900X/osfmk/DEBUG/osfmk.o: pre-linked object for osfmk component + $(OBJROOT)/DEBUG_ARM_S5L8900X/mach_kernel: bootable image + $(OBJROOT)/DEVELOPMENT_ARM_S5L8900X/osfmk/DEVELOPMENT/osfmk.o: pre-linked object for osfmk component + $(OBJROOT)/DEVELOPMENT_ARM_S5L8900X/mach_kernel: bootable image /* this is all you need to do to build H1 arm with DEVELOPMENT kernel configuration */ make TARGET_CONFIGS="default arm default" diff --git a/bsd/conf/MASTER b/bsd/conf/MASTER index 8e308ece2..5419f96bb 100644 --- a/bsd/conf/MASTER +++ b/bsd/conf/MASTER @@ -165,6 +165,9 @@ options CONFIG_SOWUPCALL # SB_UPCALL on sowwakeup # options CONFIG_FORCE_OUT_IFP # Force IP output to use an interface # options CONFIG_MBUF_NOEXPAND # limit mbuf expansion # options CONFIG_MBUF_JUMBO # jumbo cluster pool # +options CONFIG_IP_EDGEHOLE # Drop tagged packets at EDGE interface # + +options CONFIG_WORKQUEUE # # # 4.4 filesystems @@ -278,7 +281,7 @@ options CONFIG_VNODES=263168 # options CONFIG_VNODES=263168 # options CONFIG_VNODES=10240 
# options CONFIG_VNODES=1024 # -options CONFIG_VNODES=512 # +options CONFIG_VNODES=640 # options CONFIG_VNODE_FREE_MIN=500 # options CONFIG_VNODE_FREE_MIN=300 # @@ -324,6 +327,11 @@ options CONFIG_NMBCLUSTERS="((1024 * 1024) / MCLBYTES)" # options CONFIG_NMBCLUSTERS="((1024 * 512) / MCLBYTES)" # options CONFIG_NMBCLUSTERS="((1024 * 256) / MCLBYTES)" # +# set maximum space used for packet buffers +# +options CONFIG_USESOCKTHRESHOLD=1 # +options CONFIG_USESOCKTHRESHOLD=0 # + # # Configure size of TCP hash table # @@ -388,6 +396,13 @@ options CONFIG_NO_KPRINTF_STRINGS # # options CONFIG_EMBEDDED # +# +# code decryption... used on embedded for app protection +# must be set in all the bsd/conf and osfmk/conf MASTER files +# +options CONFIG_CODE_DECRYPTION # + + # # Ethernet (ARP) # diff --git a/bsd/conf/MASTER.i386 b/bsd/conf/MASTER.i386 index a4504b8a8..c2ae93a18 100644 --- a/bsd/conf/MASTER.i386 +++ b/bsd/conf/MASTER.i386 @@ -44,7 +44,7 @@ # # Standard Apple Research Configurations: # -------- ----- -------- --------------- -# BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug compat_43_tty sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot ] +# BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug compat_43_tty sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue ] # FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo ffs union cd9660 config_volfs ] # NETWORKING = [ inet inet6 compat_oldsock mrouting tcpdrop_synfin bpfilter ipdivert netat ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo ] # NFS = [ nfsclient nfsserver ] @@ -53,7 +53,7 @@ # PROFILE = [ RELEASE profile ] # DEBUG = [ BASE NETWORKING NFS VPN FILESYS libdriver_g debug xpr_debug 
mach_assert ] # -# EMBEDDED_BASE = [ intel mach bsmall vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug compat_43_tty sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot ] +# EMBEDDED_BASE = [ intel mach bsmall vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug compat_43_tty sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue ] # EMBEDDED_FILESYS = [ devfs hfs journaling fdesc fifo ] # EMBEDDED_NET = [ inet compat_oldsock mrouting tcpdrop_synfin bpfilter config_mbuf_noexpand ] # EMBEDDED = [ EMBEDDED_BASE EMBEDDED_NET VPN EMBEDDED_FILESYS libdriver no_printf_str no_kprintf_str no_kdebug ] @@ -92,6 +92,12 @@ options CONFIG_MACF_SOCKET_SUBSET # MAC socket subest (no labels) # app-profiling i.e. pre-heating - off? options CONFIG_APP_PROFILE=0 +# +# code decryption... used on i386 for DSMOS +# must be set in all the bsd/conf and osfmk/conf MASTER files +# +options CONFIG_CODE_DECRYPTION + # # Ipl measurement system # diff --git a/bsd/conf/MASTER.ppc b/bsd/conf/MASTER.ppc index 9f4a08d6d..b66984140 100644 --- a/bsd/conf/MASTER.ppc +++ b/bsd/conf/MASTER.ppc @@ -45,7 +45,7 @@ # Standard Apple Research Configurations: # -------- ----- -------- --------------- # -# BASE = [ ppc mach medium config_dtrace vol pst gdb noprofiling simple_clock kernstack compat_43_tty sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot ] +# BASE = [ ppc mach medium config_dtrace vol pst gdb noprofiling simple_clock kernstack compat_43_tty sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue ] # FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo ffs union cd9660 config_volfs ] # NETWORKING = [ inet inet6 compat_oldsock mrouting tcpdrop_synfin bpfilter ipdivert netat ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk ] # NFS = [ 
nfsclient nfsserver ] diff --git a/bsd/conf/files b/bsd/conf/files index 4f927bcba..0a7cfa9ae 100644 --- a/bsd/conf/files +++ b/bsd/conf/files @@ -12,6 +12,7 @@ OPTIONS/hw_ast optional hw_ast OPTIONS/hw_footprint optional hw_footprint OPTIONS/kernserv optional kernserv +OPTIONS/config_ip_edgehole optional config_ip_edgehole OPTIONS/config_macf optional config_macf OPTIONS/config_macf_socket_subset optional config_macf_socket_subset OPTIONS/config_macf_socket optional config_macf_socket @@ -262,6 +263,7 @@ bsd/netinet/ip_id.c optional randomipid inet bsd/netinet/ip_input.c optional inet bsd/netinet/ip_mroute.c optional mrouting bsd/netinet/ip_output.c optional inet +bsd/netinet/ip_edgehole.c optional config_ip_edgehole bsd/netinet/raw_ip.c optional inet bsd/netinet/tcp_debug.c optional tcpdebug bsd/netinet/tcp_input.c optional inet diff --git a/bsd/dev/dtrace/dtrace.c b/bsd/dev/dtrace/dtrace.c index c28ae6f0b..eebffddbb 100644 --- a/bsd/dev/dtrace/dtrace.c +++ b/bsd/dev/dtrace/dtrace.c @@ -16549,7 +16549,7 @@ dtrace_init( void ) * XXX Warn if state is LAZY_OFF? It won't break anything, but * makes no sense... 
*/ - if (!PE_parse_boot_arg("dtrace_dof_mode", &dtrace_dof_mode)) { + if (!PE_parse_boot_argn("dtrace_dof_mode", &dtrace_dof_mode, sizeof (dtrace_dof_mode))) { dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON; } diff --git a/bsd/dev/dtrace/fbt.c b/bsd/dev/dtrace/fbt.c index f1b85f42f..311e4d6cf 100644 --- a/bsd/dev/dtrace/fbt.c +++ b/bsd/dev/dtrace/fbt.c @@ -421,7 +421,7 @@ void fbt_init( void ) { - PE_parse_boot_arg("DisableFBT", &gDisableFBT); + PE_parse_boot_argn("DisableFBT", &gDisableFBT, sizeof (gDisableFBT)); if (0 == gDisableFBT) { diff --git a/bsd/dev/i386/fbt_x86.c b/bsd/dev/i386/fbt_x86.c index 219224f80..b2b021280 100644 --- a/bsd/dev/i386/fbt_x86.c +++ b/bsd/dev/i386/fbt_x86.c @@ -142,7 +142,6 @@ static const char * critical_blacklist[] = "pmap_cpu_high_map_vaddr", "pmap_cpu_high_shared_remap", "pmap_cpu_init", - "rdHPET", "register_cpu_setup_func", "unregister_cpu_setup_func" }; @@ -405,7 +404,7 @@ __fbt_provide_module(void *arg, struct modctl *ctl) unsigned int i, j; int gIgnoreFBTBlacklist = 0; - PE_parse_boot_arg("IgnoreFBTBlacklist", &gIgnoreFBTBlacklist); + PE_parse_boot_argn("IgnoreFBTBlacklist", &gIgnoreFBTBlacklist, sizeof (gIgnoreFBTBlacklist)); mh = (struct mach_header *)(ctl->address); modname = ctl->mod_modname; @@ -527,17 +526,12 @@ __fbt_provide_module(void *arg, struct modctl *ctl) */ if (strstr(name, "machine_stack_") == name || strstr(name, "mapping_") == name || - strstr(name, "hpet_") == name || - - 0 == strcmp(name, "rdHPET") || - 0 == strcmp(name, "HPETInterrupt") || 0 == strcmp(name, "tmrCvt") || strstr(name, "tsc_") == name || strstr(name, "pmCPU") == name || 0 == strcmp(name, "Cstate_table_set") || - 0 == strcmp(name, "pmHPETInterrupt") || 0 == strcmp(name, "pmKextRegister") || 0 == strcmp(name, "pmSafeMode") || 0 == strcmp(name, "pmUnregister") || diff --git a/bsd/dev/i386/sysctl.c b/bsd/dev/i386/sysctl.c index c7b04d296..2637c2654 100644 --- a/bsd/dev/i386/sysctl.c +++ b/bsd/dev/i386/sysctl.c @@ -31,6 +31,7 @@ #include #include 
#include +#include static int hw_cpu_sysctl SYSCTL_HANDLER_ARGS @@ -166,6 +167,12 @@ SYSCTL_PROC(_machdep_cpu, OID_AUTO, cores_per_package, sizeof(uint32_t), hw_cpu_sysctl, "I", "CPU cores per package"); +SYSCTL_PROC(_machdep_cpu, OID_AUTO, microcode_version, + CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_microcode_version), + sizeof(uint32_t), + hw_cpu_sysctl, "I", "Microcode version number"); + SYSCTL_NODE(_machdep_cpu, OID_AUTO, mwait, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "mwait"); @@ -291,6 +298,34 @@ SYSCTL_PROC(_machdep_cpu_cache, OID_AUTO, size, hw_cpu_sysctl, "I", "Cache size (in Kbytes)"); +SYSCTL_NODE(_machdep_cpu, OID_AUTO, tlb, CTLFLAG_RW|CTLFLAG_LOCKED, 0, + "tlb"); + +SYSCTL_PROC(_machdep_cpu_tlb, OID_AUTO, inst_small, + CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_itlb_small), + sizeof(uint32_t), + hw_cpu_sysctl, "I", "Number of small page instruction TLBs"); + +SYSCTL_PROC(_machdep_cpu_tlb, OID_AUTO, data_small, + CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_dtlb_small), + sizeof(uint32_t), + hw_cpu_sysctl, "I", "Number of small page data TLBs"); + +SYSCTL_PROC(_machdep_cpu_tlb, OID_AUTO, inst_large, + CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_itlb_large), + sizeof(uint32_t), + hw_cpu_sysctl, "I", "Number of large page instruction TLBs"); + +SYSCTL_PROC(_machdep_cpu_tlb, OID_AUTO, data_large, + CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, cpuid_dtlb_large), + sizeof(uint32_t), + hw_cpu_sysctl, "I", "Number of large page data TLBs"); + + SYSCTL_NODE(_machdep_cpu, OID_AUTO, address_bits, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "address_bits"); @@ -306,6 +341,19 @@ SYSCTL_PROC(_machdep_cpu_address_bits, OID_AUTO, virtual, sizeof(uint32_t), hw_cpu_sysctl, "I", "Number of virtual address bits"); +SYSCTL_PROC(_machdep_cpu, OID_AUTO, core_count, + CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, core_count), + sizeof(uint32_t), + hw_cpu_sysctl, "I", 
"Number of enabled cores per package"); + +SYSCTL_PROC(_machdep_cpu, OID_AUTO, thread_count, + CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(i386_cpu_info_t, thread_count), + sizeof(uint32_t), + hw_cpu_sysctl, "I", "Number of enabled threads per package"); + + uint64_t pmap_pv_hashlist_walks; uint64_t pmap_pv_hashlist_cnts; uint32_t pmap_pv_hashlist_max; diff --git a/bsd/dev/i386/systemcalls.c b/bsd/dev/i386/systemcalls.c index fa6db0a3d..6b24ccf6c 100644 --- a/bsd/dev/i386/systemcalls.c +++ b/bsd/dev/i386/systemcalls.c @@ -61,7 +61,6 @@ extern void dtrace_systrace_syscall_return(unsigned short, int, int *); extern void unix_syscall(x86_saved_state_t *); extern void unix_syscall64(x86_saved_state_t *); extern void *find_user_regs(thread_t); -extern void throttle_lowpri_io(int *lowpri_window, mount_t v_mount); extern void x86_toggle_sysenter_arg_store(thread_t thread, boolean_t valid); extern boolean_t x86_sysenter_arg_store_isvalid(thread_t thread); @@ -223,7 +222,7 @@ unix_syscall(x86_saved_state_t *state) */ syscall_exit_funnelcheck(); #endif /* DEBUG */ - if (uthread->uu_lowpri_window && uthread->v_mount) { + if (uthread->uu_lowpri_window) { /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call @@ -231,7 +230,7 @@ unix_syscall(x86_saved_state_t *state) * delay in order to mitigate the impact of this * task on the normal operation of the system */ - throttle_lowpri_io(&uthread->uu_lowpri_window,uthread->v_mount); + throttle_lowpri_io(TRUE); } if (code != 180) KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, @@ -398,7 +397,7 @@ unix_syscall64(x86_saved_state_t *state) */ syscall_exit_funnelcheck(); - if (uthread->uu_lowpri_window && uthread->v_mount) { + if (uthread->uu_lowpri_window) { /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call @@ -406,7 +405,7 @@ unix_syscall64(x86_saved_state_t *state) * delay in order to mitigate the impact of this * task on 
the normal operation of the system */ - throttle_lowpri_io(&uthread->uu_lowpri_window,uthread->v_mount); + throttle_lowpri_io(TRUE); } if (code != 180) KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, @@ -533,7 +532,7 @@ unix_syscall_return(int error) */ syscall_exit_funnelcheck(); - if (uthread->uu_lowpri_window && uthread->v_mount) { + if (uthread->uu_lowpri_window) { /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call @@ -541,7 +540,7 @@ unix_syscall_return(int error) * delay in order to mitigate the impact of this * task on the normal operation of the system */ - throttle_lowpri_io(&uthread->uu_lowpri_window,uthread->v_mount); + throttle_lowpri_io(TRUE); } if (code != 180) KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, diff --git a/bsd/dev/memdev.c b/bsd/dev/memdev.c index f957be33c..5b4e005e7 100644 --- a/bsd/dev/memdev.c +++ b/bsd/dev/memdev.c @@ -108,9 +108,11 @@ static strategy_fcn_t mdevstrategy; static int mdevbioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p); static int mdevcioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p); static int mdevrw(dev_t dev, struct uio *uio, int ioflag); +#ifdef CONFIG_MEMDEV_INSECURE static char * nonspace(char *pos, char *end); static char * getspace(char *pos, char *end); static char * cvtnum(char *pos, char *end, unsigned int *num); +#endif /* CONFIG_MEMDEV_INSECURE */ extern void bcopy_phys(addr64_t from, addr64_t to, vm_size_t bytes); extern void mapping_set_mod(ppnum_t pn); @@ -428,13 +430,14 @@ static int mdevsize(dev_t dev) { void mdevinit(__unused int the_cnt) { +#ifdef CONFIG_MEMDEV_INSECURE + int devid, phys; ppnum_t base; unsigned int size; char *ba, *lp; dev_t dev; - ba = PE_boot_args(); /* Get the boot arguments */ lp = ba + 256; /* Point to the end */ @@ -471,11 +474,13 @@ void mdevinit(__unused int the_cnt) { dev = mdevadd(devid, base >> 12, size >> 12, phys); /* Go add the 
device */ } - + +#endif /* CONFIG_MEMDEV_INSECURE */ return; } +#ifdef CONFIG_MEMDEV_INSECURE char *nonspace(char *pos, char *end) { /* Find next non-space in string */ if(pos >= end) return end; /* Don't go past end */ @@ -529,6 +534,7 @@ char *cvtnum(char *pos, char *end, unsigned int *num) { /* Convert to a number pos++; /* Step on */ } } +#endif /* CONFIG_MEMDEV_INSECURE */ dev_t mdevadd(int devid, ppnum_t base, unsigned int size, int phys) { diff --git a/bsd/dev/ppc/fbt_ppc.c b/bsd/dev/ppc/fbt_ppc.c index e2ffc4216..5ee9cea6a 100644 --- a/bsd/dev/ppc/fbt_ppc.c +++ b/bsd/dev/ppc/fbt_ppc.c @@ -333,8 +333,8 @@ __fbt_provide_module(void *arg, struct modctl *ctl) char *modname; unsigned int i; - int gIgnoreFBTBlacklist = 0; - PE_parse_boot_arg("IgnoreFBTBlacklist", &gIgnoreFBTBlacklist); + int gIgnoreFBTBlacklist = 0; + PE_parse_boot_argn("IgnoreFBTBlacklist", &gIgnoreFBTBlacklist, sizeof (gIgnoreFBTBlacklist)); mh = (struct mach_header *)(ctl->address); modname = ctl->mod_modname; diff --git a/bsd/dev/ppc/systemcalls.c b/bsd/dev/ppc/systemcalls.c index 11f9eb9c4..8cc3ca4b8 100644 --- a/bsd/dev/ppc/systemcalls.c +++ b/bsd/dev/ppc/systemcalls.c @@ -69,7 +69,6 @@ find_user_regs( thread_t act); extern lck_spin_t * tz_slock; -extern void throttle_lowpri_io(int *lowpri_window, mount_t v_mount); /* * Function: unix_syscall @@ -261,7 +260,7 @@ unix_syscall(struct savearea *regs) /* panic if funnel is held */ syscall_exit_funnelcheck(); - if (uthread->uu_lowpri_window && uthread->v_mount) { + if (uthread->uu_lowpri_window) { /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call @@ -269,7 +268,7 @@ unix_syscall(struct savearea *regs) * delay in order to mitigate the impact of this * task on the normal operation of the system */ - throttle_lowpri_io(&uthread->uu_lowpri_window,uthread->v_mount); + throttle_lowpri_io(TRUE); } if (kdebug_enable && (code != 180)) { @@ -373,7 +372,7 @@ unix_syscall_return(int error) /* panic if funnel is 
held */ syscall_exit_funnelcheck(); - if (uthread->uu_lowpri_window && uthread->v_mount) { + if (uthread->uu_lowpri_window) { /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call @@ -381,7 +380,7 @@ unix_syscall_return(int error) * delay in order to mitigate the impact of this * task on the normal operation of the system */ - throttle_lowpri_io(&uthread->uu_lowpri_window,uthread->v_mount); + throttle_lowpri_io(TRUE); } if (kdebug_enable && (code != 180)) { if (callp->sy_return_type == _SYSCALL_RET_SSIZE_T) diff --git a/bsd/dev/unix_startup.c b/bsd/dev/unix_startup.c index d3df77196..d2dd20b11 100644 --- a/bsd/dev/unix_startup.c +++ b/bsd/dev/unix_startup.c @@ -145,6 +145,11 @@ bsd_startupearly(void) #if SOCKETS { +#if CONFIG_USESOCKTHRESHOLD + static const unsigned int maxspace = 64 * 1024; +#else + static const unsigned int maxspace = 128 * 1024; +#endif int scale; nmbclusters = bsd_mbuf_cluster_reserve() / MCLBYTES; @@ -154,10 +159,10 @@ bsd_startupearly(void) tcp_sendspace *= scale; tcp_recvspace *= scale; - if (tcp_sendspace > (64 * 1024)) - tcp_sendspace = 64 * 1024; - if (tcp_recvspace > (64 * 1024)) - tcp_recvspace = 64 * 1024; + if (tcp_sendspace > maxspace) + tcp_sendspace = maxspace; + if (tcp_recvspace > maxspace) + tcp_recvspace = maxspace; } #endif /* INET || INET6 */ } diff --git a/bsd/hfs/hfs.h b/bsd/hfs/hfs.h index 5568cf9c5..cfed9e65d 100644 --- a/bsd/hfs/hfs.h +++ b/bsd/hfs/hfs.h @@ -124,9 +124,11 @@ typedef struct hfsmount { u_int32_t hfs_flags; /* see below */ /* Physical Description */ - u_long hfs_phys_block_size; /* Always a multiple of 512 */ - daddr64_t hfs_phys_block_count; /* Num of PHYSICAL blocks of volume */ - daddr64_t hfs_alt_id_sector; /* location of alternate VH/MDB */ + u_int32_t hfs_logical_block_size; /* Logical block size of the disk as reported by ioctl(DKIOCGETBLOCKSIZE), always a multiple of 512 */ + daddr64_t hfs_logical_block_count; /* Number of logical blocks on the disk */ + 
daddr64_t hfs_alt_id_sector; /* location of alternate VH/MDB */ + u_int32_t hfs_physical_block_size; /* Physical block size of the disk as reported by ioctl(DKIOCGETPHYSICALBLOCKSIZE) */ + u_int32_t hfs_log_per_phys; /* Number of logical blocks per physical block size */ /* Access to VFS and devices */ struct mount *hfs_mp; /* filesystem vfs structure */ @@ -337,6 +339,8 @@ enum privdirtype {FILE_HARDLINKS, DIR_HARDLINKS}; #define HFS_FOLDERCOUNT 0x10000 /* When set, the file system exists on a virtual device, like disk image */ #define HFS_VIRTUAL_DEVICE 0x20000 +/* When set, we're in hfs_changefs, so hfs_sync should do nothing. */ +#define HFS_IN_CHANGEFS 0x40000 /* Macro to update next allocation block in the HFS mount structure. If @@ -457,6 +461,10 @@ enum { kHFSPlusMaxFileNameBytes = kHFSPlusMaxFileNameChars * 3 }; #define HFS_ALT_SECTOR(blksize, blkcnt) (((blkcnt) - 1) - (512 / (blksize))) #define HFS_ALT_OFFSET(blksize) ((blksize) > 1024 ? (blksize) - 1024 : 0) +/* Convert the logical sector number to be aligned on physical block size boundary. + * We are assuming the partition is a multiple of physical block size. + */ +#define HFS_PHYSBLK_ROUNDDOWN(sector_num, log_per_phys) ((sector_num / log_per_phys) * log_per_phys) /* * HFS specific fcntl()'s diff --git a/bsd/hfs/hfs_catalog.c b/bsd/hfs/hfs_catalog.c index 04db5d290..6f190bce5 100644 --- a/bsd/hfs/hfs_catalog.c +++ b/bsd/hfs/hfs_catalog.c @@ -196,6 +196,7 @@ cat_convertattr( } else { /* Convert the data fork. */ datafp->cf_size = recp->hfsPlusFile.dataFork.logicalSize; + datafp->cf_new_size = 0; datafp->cf_blocks = recp->hfsPlusFile.dataFork.totalBlocks; if ((hfsmp->hfc_stage == HFC_RECORDING) && (attrp->ca_atime >= hfsmp->hfc_timebase)) { @@ -211,6 +212,7 @@ cat_convertattr( /* Convert the resource fork. 
*/ rsrcfp->cf_size = recp->hfsPlusFile.resourceFork.logicalSize; + rsrcfp->cf_new_size = 0; rsrcfp->cf_blocks = recp->hfsPlusFile.resourceFork.totalBlocks; if ((hfsmp->hfc_stage == HFC_RECORDING) && (attrp->ca_atime >= hfsmp->hfc_timebase)) { @@ -686,6 +688,7 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files } else if (wantrsrc) { /* Convert the resource fork. */ forkp->cf_size = recp->hfsPlusFile.resourceFork.logicalSize; + forkp->cf_new_size = 0; forkp->cf_blocks = recp->hfsPlusFile.resourceFork.totalBlocks; if ((hfsmp->hfc_stage == HFC_RECORDING) && (to_bsd_time(recp->hfsPlusFile.accessDate) >= hfsmp->hfc_timebase)) { @@ -704,6 +707,7 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files /* Convert the data fork. */ forkp->cf_size = recp->hfsPlusFile.dataFork.logicalSize; + forkp->cf_new_size = 0; forkp->cf_blocks = recp->hfsPlusFile.dataFork.totalBlocks; if ((hfsmp->hfc_stage == HFC_RECORDING) && (to_bsd_time(recp->hfsPlusFile.accessDate) >= hfsmp->hfc_timebase)) { @@ -2177,7 +2181,7 @@ cat_makealias(struct hfsmount *hfsmp, u_int32_t inode_num, struct HFSPlusCatalog blksize = hfsmp->blockSize; blkcount = howmany(kHFSAliasSize, blksize); - sectorsize = hfsmp->hfs_phys_block_size; + sectorsize = hfsmp->hfs_logical_block_size; bzero(rsrcforkp, sizeof(HFSPlusForkData)); /* Allocate some disk space for the alias content. 
*/ @@ -2193,7 +2197,7 @@ cat_makealias(struct hfsmount *hfsmp, u_int32_t inode_num, struct HFSPlusCatalog blkno = ((u_int64_t)rsrcforkp->extents[0].startBlock * (u_int64_t)blksize) / sectorsize; blkno += hfsmp->hfsPlusIOPosOffset / sectorsize; - bp = buf_getblk(hfsmp->hfs_devvp, blkno, roundup(kHFSAliasSize, hfsmp->hfs_phys_block_size), 0, 0, BLK_META); + bp = buf_getblk(hfsmp->hfs_devvp, blkno, roundup(kHFSAliasSize, hfsmp->hfs_logical_block_size), 0, 0, BLK_META); if (hfsmp->jnl) { journal_modify_block_start(hfsmp->jnl, bp); } diff --git a/bsd/hfs/hfs_catalog.h b/bsd/hfs/hfs_catalog.h index 03c36567b..0c511ff67 100644 --- a/bsd/hfs/hfs_catalog.h +++ b/bsd/hfs/hfs_catalog.h @@ -113,9 +113,15 @@ struct cat_attr { * Catalog Node Fork (runtime) * * NOTE: this is not the same as a struct HFSPlusForkData + * + * NOTE: if cf_new_size > cf_size, then a write is in progress and is extending + * the EOF; the new EOF will be cf_new_size. Writes and pageouts may validly + * write up to cf_new_size, but reads should only read up to cf_size. When + * an extending write is not in progress, cf_new_size is zero. 
*/ struct cat_fork { off_t cf_size; /* fork's logical size in bytes */ + off_t cf_new_size; /* fork's logical size after write completes */ union { u_int32_t cfu_clump; /* fork's clump size in bytes (sys files only) */ u_int64_t cfu_bytesread; /* bytes read from this fork */ diff --git a/bsd/hfs/hfs_cnode.c b/bsd/hfs/hfs_cnode.c index 7ff95e593..b0ba0f1ae 100644 --- a/bsd/hfs/hfs_cnode.c +++ b/bsd/hfs/hfs_cnode.c @@ -127,7 +127,7 @@ hfs_vnop_inactive(struct vnop_inactive_args *ap) */ if (v_type == VDIR) { hfs_reldirhints(cp, 0); - } + } if (cp->c_flag & C_HARDLINK) { hfs_relorigins(cp); @@ -474,10 +474,11 @@ hfs_vnop_reclaim(struct vnop_reclaim_args *ap) if (vnode_isdir(vp)) { hfs_reldirhints(cp, 0); } - + if (cp->c_flag & C_HARDLINK) { hfs_relorigins(cp); } + } /* Release the file fork and related data */ if (fp) { diff --git a/bsd/hfs/hfs_cnode.h b/bsd/hfs/hfs_cnode.h index 7a5b13601..c4a930d16 100644 --- a/bsd/hfs/hfs_cnode.h +++ b/bsd/hfs/hfs_cnode.h @@ -61,6 +61,7 @@ typedef struct filefork filefork_t; /* Aliases for common fields */ #define ff_size ff_data.cf_size +#define ff_new_size ff_data.cf_new_size #define ff_clumpsize ff_data.cf_clump #define ff_bytesread ff_data.cf_bytesread #define ff_blocks ff_data.cf_blocks diff --git a/bsd/hfs/hfs_endian.c b/bsd/hfs/hfs_endian.c index e5775bfbc..db0f489d5 100644 --- a/bsd/hfs/hfs_endian.c +++ b/bsd/hfs/hfs_endian.c @@ -116,7 +116,9 @@ hfs_swap_BTNode ( /* * When first opening a BTree, we have to read the header node before the * control block is initialized. In this case, totalNodes will be zero, - * so skip the bounds checking. + * so skip the bounds checking. Also, we should ignore the header node when + * checking for invalid forwards and backwards links, since the header node's + * links can point back to itself legitimately. 
*/ if (btcb->totalNodes != 0) { if (srcDesc->fLink >= btcb->totalNodes) { @@ -129,6 +131,20 @@ hfs_swap_BTNode ( error = fsBTInvalidHeaderErr; goto fail; } + + if ((src->blockNum != 0) && (srcDesc->fLink == (u_int32_t) src->blockNum)) { + printf("hfs_swap_BTNode: invalid forward link (0x%08x == 0x%08x)\n", + srcDesc->fLink, (u_int32_t) src->blockNum); + error = fsBTInvalidHeaderErr; + goto fail; + } + if ((src->blockNum != 0) && (srcDesc->bLink == (u_int32_t) src->blockNum)) { + printf("hfs_swap_BTNode: invalid backward link (0x%08x == 0x%08x)\n", + srcDesc->bLink, (u_int32_t) src->blockNum); + error = fsBTInvalidHeaderErr; + goto fail; + } + } /* @@ -254,17 +270,34 @@ hfs_swap_BTNode ( if (direction == kSwapBTNodeHostToBig) { /* * Sanity check and swap the forward and backward links. + * Ignore the header node since its forward and backwards links can legitimately + * point to itself. */ if (srcDesc->fLink >= btcb->totalNodes) { panic("hfs_UNswap_BTNode: invalid forward link (0x%08X)\n", srcDesc->fLink); error = fsBTInvalidHeaderErr; goto fail; } + if ((src->blockNum != 0) && (srcDesc->fLink == (u_int32_t) src->blockNum)) { + panic ("hfs_UNswap_BTNode: invalid forward link (0x%08x == 0x%08x)\n", + srcDesc->fLink, (u_int32_t) src->blockNum); + error = fsBTInvalidHeaderErr; + goto fail; + } + if (srcDesc->bLink >= btcb->totalNodes) { panic("hfs_UNswap_BTNode: invalid backward link (0x%08X)\n", srcDesc->bLink); error = fsBTInvalidHeaderErr; goto fail; } + if ((src->blockNum != 0) && (srcDesc->bLink == (u_int32_t) src->blockNum)) { + panic ("hfs_UNswap_BTNode: invalid backward link (0x%08x == 0x%08x)\n", + srcDesc->bLink, (u_int32_t) src->blockNum); + error = fsBTInvalidHeaderErr; + goto fail; + } + + srcDesc->fLink = SWAP_BE32 (srcDesc->fLink); srcDesc->bLink = SWAP_BE32 (srcDesc->bLink); diff --git a/bsd/hfs/hfs_lookup.c b/bsd/hfs/hfs_lookup.c index 6009fb787..1e94b8fb8 100644 --- a/bsd/hfs/hfs_lookup.c +++ b/bsd/hfs/hfs_lookup.c @@ -352,9 +352,11 @@ 
hfs_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int * process removed the object before we had a chance * to create the vnode, then just treat it as the not * found case above and return EJUSTRETURN. + * We should do the same for the RENAME operation since we are + * going to write it in regardless. */ if ((retval == ENOENT) && - (cnp->cn_nameiop == CREATE) && + ((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME)) && (flags & ISLASTCN)) { retval = EJUSTRETURN; } diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c index 958ca6e3a..1e836052f 100644 --- a/bsd/hfs/hfs_readwrite.c +++ b/bsd/hfs/hfs_readwrite.c @@ -479,20 +479,51 @@ hfs_vnop_write(struct vnop_write_args *ap) hfs_unlock(cp); cnode_locked = 0; + + /* + * We need to tell UBC the fork's new size BEFORE calling + * cluster_write, in case any of the new pages need to be + * paged out before cluster_write completes (which does happen + * in embedded systems due to extreme memory pressure). + * Similarly, we need to tell hfs_vnop_pageout what the new EOF + * will be, so that it can pass that on to cluster_pageout, and + * allow those pageouts. + * + * We don't update ff_size yet since we don't want pageins to + * be able to see uninitialized data between the old and new + * EOF, until cluster_write has completed and initialized that + * part of the file. + * + * The vnode pager relies on the file size last given to UBC via + * ubc_setsize. hfs_vnop_pageout relies on fp->ff_new_size or + * ff_size (whichever is larger). NOTE: ff_new_size is always + * zero, unless we are extending the file via write. 
+ */ + if (filesize > fp->ff_size) { + fp->ff_new_size = filesize; + ubc_setsize(vp, filesize); + } retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off, tail_off, lflag | IO_NOZERODIRTY); if (retval) { + fp->ff_new_size = 0; /* no longer extending; use ff_size */ + if (filesize > origFileSize) { + ubc_setsize(vp, origFileSize); + } goto ioerr_exit; } - offset = uio_offset(uio); - if (offset > fp->ff_size) { - fp->ff_size = offset; - - ubc_setsize(vp, fp->ff_size); /* XXX check errors */ + + if (filesize > origFileSize) { + fp->ff_size = filesize; + /* Files that are changing size are not hot file candidates. */ - if (hfsmp->hfc_stage == HFC_RECORDING) + if (hfsmp->hfc_stage == HFC_RECORDING) { fp->ff_bytesread = 0; + } } + fp->ff_new_size = 0; /* ff_size now has the correct size */ + + /* If we wrote some bytes, then touch the change and mod times */ if (resid > uio_resid(uio)) { cp->c_touch_chgtime = TRUE; cp->c_touch_modtime = TRUE; @@ -2947,9 +2978,17 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) cp = VTOC(vp); fp = VTOF(vp); - if (vnode_isswap(vp)) { - filesize = fp->ff_size; - } else { + /* + * Figure out where the file ends, for pageout purposes. If + * ff_new_size > ff_size, then we're in the middle of extending the + * file via a write, so it is safe (and necessary) that we be able + * to pageout up to that point. 
+ */ + filesize = fp->ff_size; + if (fp->ff_new_size > filesize) + filesize = fp->ff_new_size; + + if (!vnode_isswap(vp)) { off_t end_of_range; int tooklock = 0; @@ -2966,7 +3005,6 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) tooklock = 1; } - filesize = fp->ff_size; end_of_range = ap->a_f_offset + ap->a_size - 1; if (end_of_range >= filesize) { @@ -3219,7 +3257,7 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, retval = ENOSPC; goto restore; } else if ((eflags & kEFMetadataMask) && - ((((u_int64_t)sector_b * hfsmp->hfs_phys_block_size) / blksize) > + ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) > hfsmp->hfs_metazone_end)) { const char * filestr; char emptystr = '\0'; diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c index 7b67b6686..8eac4e20e 100644 --- a/bsd/hfs/hfs_vfsops.c +++ b/bsd/hfs/hfs_vfsops.c @@ -231,8 +231,11 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte if ((retval = hfs_flushfiles(mp, flags, p))) goto out; - hfsmp->hfs_flags |= HFS_READ_ONLY; + + /* mark the volume cleanly unmounted */ + hfsmp->vcbAtrb |= kHFSVolumeUnmountedMask; retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); + hfsmp->hfs_flags |= HFS_READ_ONLY; /* also get the volume bitmap blocks */ if (!retval) { @@ -275,11 +278,6 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte goto out; } - - retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); - if (retval != E_NONE) - goto out; - // If the journal was shut-down previously because we were // asked to be read-only, let's start it back up again now @@ -300,7 +298,7 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte (hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset, hfsmp->jnl_size, hfsmp->hfs_devvp, - hfsmp->hfs_phys_block_size, + hfsmp->hfs_logical_block_size, jflags, 0, hfs_sync_metadata, hfsmp->hfs_mp); @@ -319,7 +317,14 @@ 
hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte /* Only clear HFS_READ_ONLY after a successfull write */ hfsmp->hfs_flags &= ~HFS_READ_ONLY; - if (!(hfsmp->hfs_flags & (HFS_READ_ONLY & HFS_STANDARD))) { + /* mark the volume dirty (clear clean unmount bit) */ + hfsmp->vcbAtrb &= ~kHFSVolumeUnmountedMask; + + retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); + if (retval != E_NONE) + goto out; + + if (!(hfsmp->hfs_flags & (HFS_READ_ONLY | HFS_STANDARD))) { /* Setup private/hidden directories for hardlinks. */ hfs_privatedir_init(hfsmp, FILE_HARDLINKS); hfs_privatedir_init(hfsmp, DIR_HARDLINKS); @@ -439,6 +444,8 @@ hfs_changefs(struct mount *mp, struct hfs_mount_args *args) vcb = HFSTOVCB(hfsmp); mount_flags = (unsigned int)vfs_flags(mp); + hfsmp->hfs_flags |= HFS_IN_CHANGEFS; + permswitch = (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) && ((mount_flags & MNT_UNKNOWNPERMISSIONS) == 0)) || (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) == 0) && @@ -447,7 +454,8 @@ hfs_changefs(struct mount *mp, struct hfs_mount_args *args) /* The root filesystem must operate with actual permissions: */ if (permswitch && (mount_flags & MNT_ROOTFS) && (mount_flags & MNT_UNKNOWNPERMISSIONS)) { vfs_clearflags(mp, (u_int64_t)((unsigned int)MNT_UNKNOWNPERMISSIONS)); /* Just say "No". 
*/ - return EINVAL; + retval = EINVAL; + goto exit; } if (mount_flags & MNT_UNKNOWNPERMISSIONS) hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS; @@ -555,6 +563,7 @@ hfs_changefs(struct mount *mp, struct hfs_mount_args *args) (void) hfs_relconverter(old_encoding); } exit: + hfsmp->hfs_flags &= ~HFS_IN_CHANGEFS; return (retval); } @@ -626,7 +635,6 @@ hfs_reload(struct mount *mountp) { register struct vnode *devvp; struct buf *bp; - int sectorsize; int error, i; struct hfsmount *hfsmp; struct HFSPlusVolumeHeader *vhp; @@ -634,6 +642,7 @@ hfs_reload(struct mount *mountp) struct filefork *forkp; struct cat_desc cndesc; struct hfs_reload_cargs args; + daddr64_t priIDSector; hfsmp = VFSTOHFS(mountp); vcb = HFSTOVCB(hfsmp); @@ -665,18 +674,19 @@ hfs_reload(struct mount *mountp) /* * Re-read VolumeHeader from disk. */ - sectorsize = hfsmp->hfs_phys_block_size; + priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + + HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size)); error = (int)buf_meta_bread(hfsmp->hfs_devvp, - (daddr64_t)((vcb->hfsPlusIOPosOffset / sectorsize) + HFS_PRI_SECTOR(sectorsize)), - sectorsize, NOCRED, &bp); + HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, NOCRED, &bp); if (error) { if (bp != NULL) buf_brelse(bp); return (error); } - vhp = (HFSPlusVolumeHeader *) (buf_dataptr(bp) + HFS_PRI_OFFSET(sectorsize)); + vhp = (HFSPlusVolumeHeader *) (buf_dataptr(bp) + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size)); /* Do a quick sanity check */ if ((SWAP_BE16(vhp->signature) != kHFSPlusSigWord && @@ -812,8 +822,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, int mntwrapper; kauth_cred_t cred; u_int64_t disksize; - daddr64_t blkcnt; - u_int32_t blksize; + daddr64_t log_blkcnt; + u_int32_t log_blksize; + u_int32_t phys_blksize; u_int32_t minblksize; u_int32_t iswritable; daddr64_t mdb_offset; @@ -832,13 +843,25 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, 
struct hfs_mount_args *args, /* Advisory locking should be handled at the VFS layer */ vfs_setlocklocal(mp); - /* Get the real physical block size. */ - if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, context)) { + /* Get the logical block size (treated as physical block size everywhere) */ + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&log_blksize, 0, context)) { retval = ENXIO; goto error_exit; } + /* Get the physical block size. */ + retval = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_blksize, 0, context); + if (retval) { + if ((retval != ENOTSUP) && (retval != ENOTTY)) { + retval = ENXIO; + goto error_exit; + } + /* If device does not support this ioctl, assume that physical + * block size is same as logical block size + */ + phys_blksize = log_blksize; + } /* Switch to 512 byte sectors (temporarily) */ - if (blksize > 512) { + if (log_blksize > 512) { u_int32_t size512 = 512; if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, context)) { @@ -847,15 +870,15 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, } } /* Get the number of 512 byte physical blocks. */ - if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, context)) { + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { /* resetting block size may fail if getting block count did */ - (void)VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, context); + (void)VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context); retval = ENXIO; goto error_exit; } /* Compute an accurate disk size (i.e. within 512 bytes) */ - disksize = (u_int64_t)blkcnt * (u_int64_t)512; + disksize = (u_int64_t)log_blkcnt * (u_int64_t)512; /* * On Tiger it is not necessary to switch the device @@ -863,18 +886,20 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, * worth of blocks but to insure compatibility with * pre-Tiger systems we have to do it. 
*/ - if (blkcnt > 0x000000007fffffff) { - minblksize = blksize = 4096; + if (log_blkcnt > 0x000000007fffffff) { + minblksize = log_blksize = 4096; + if (phys_blksize < log_blksize) + phys_blksize = log_blksize; } /* Now switch to our preferred physical block size. */ - if (blksize > 512) { - if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, context)) { + if (log_blksize > 512) { + if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) { retval = ENXIO; goto error_exit; } /* Get the count of physical blocks. */ - if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, context)) { + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { retval = ENXIO; goto error_exit; } @@ -882,16 +907,18 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, /* * At this point: * minblksize is the minimum physical block size - * blksize has our preferred physical block size - * blkcnt has the total number of physical blocks + * log_blksize has our preferred physical block size + * log_blkcnt has the total number of physical blocks */ - mdb_offset = (daddr64_t)HFS_PRI_SECTOR(blksize); - if ((retval = (int)buf_meta_bread(devvp, mdb_offset, blksize, cred, &bp))) { + mdb_offset = (daddr64_t)HFS_PRI_SECTOR(log_blksize); + if ((retval = (int)buf_meta_bread(devvp, + HFS_PHYSBLK_ROUNDDOWN(mdb_offset, (phys_blksize/log_blksize)), + phys_blksize, cred, &bp))) { goto error_exit; } MALLOC(mdbp, HFSMasterDirectoryBlock *, kMDBSize, M_TEMP, M_WAITOK); - bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(blksize), mdbp, kMDBSize); + bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, kMDBSize); buf_brelse(bp); bp = NULL; @@ -912,8 +939,10 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, hfsmp->hfs_raw_dev = vnode_specrdev(devvp); hfsmp->hfs_devvp = devvp; vnode_ref(devvp); /* Hold a ref on the device, dropped when hfsmp is freed. 
*/ - hfsmp->hfs_phys_block_size = blksize; - hfsmp->hfs_phys_block_count = blkcnt; + hfsmp->hfs_logical_block_size = log_blksize; + hfsmp->hfs_logical_block_count = log_blkcnt; + hfsmp->hfs_physical_block_size = phys_blksize; + hfsmp->hfs_log_per_phys = (phys_blksize / log_blksize); hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA; if (ronly) hfsmp->hfs_flags |= HFS_READ_ONLY; @@ -983,18 +1012,18 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, goto error_exit; } /* HFS disks can only use 512 byte physical blocks */ - if (blksize > kHFSBlockSize) { - blksize = kHFSBlockSize; - if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, context)) { + if (log_blksize > kHFSBlockSize) { + log_blksize = kHFSBlockSize; + if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) { retval = ENXIO; goto error_exit; } - if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, context)) { + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { retval = ENXIO; goto error_exit; } - hfsmp->hfs_phys_block_size = blksize; - hfsmp->hfs_phys_block_count = blkcnt; + hfsmp->hfs_logical_block_size = log_blksize; + hfsmp->hfs_logical_block_count = log_blkcnt; } if (args) { hfsmp->hfs_encoding = args->hfs_encoding; @@ -1030,37 +1059,38 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, * block size so everything will line up on a block * boundary. 
*/ - if ((embeddedOffset % blksize) != 0) { + if ((embeddedOffset % log_blksize) != 0) { printf("HFS Mount: embedded volume offset not" " a multiple of physical block size (%d);" - " switching to 512\n", blksize); - blksize = 512; + " switching to 512\n", log_blksize); + log_blksize = 512; if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, - (caddr_t)&blksize, FWRITE, context)) { + (caddr_t)&log_blksize, FWRITE, context)) { retval = ENXIO; goto error_exit; } if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, - (caddr_t)&blkcnt, 0, context)) { + (caddr_t)&log_blkcnt, 0, context)) { retval = ENXIO; goto error_exit; } /* Note: relative block count adjustment */ - hfsmp->hfs_phys_block_count *= - hfsmp->hfs_phys_block_size / blksize; - hfsmp->hfs_phys_block_size = blksize; + hfsmp->hfs_logical_block_count *= + hfsmp->hfs_logical_block_size / log_blksize; + hfsmp->hfs_logical_block_size = log_blksize; } disksize = (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.blockCount) * (u_int64_t)SWAP_BE32(mdbp->drAlBlkSiz); - hfsmp->hfs_phys_block_count = disksize / blksize; + hfsmp->hfs_logical_block_count = disksize / log_blksize; - mdb_offset = (daddr64_t)((embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize)); - retval = (int)buf_meta_bread(devvp, mdb_offset, blksize, cred, &bp); + mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize)); + retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), + phys_blksize, cred, &bp); if (retval) goto error_exit; - bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(blksize), mdbp, 512); + bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, 512); buf_brelse(bp); bp = NULL; vhp = (HFSPlusVolumeHeader*) mdbp; @@ -1119,13 +1149,15 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, hfsmp->hfs_flags |= HFS_NEED_JNL_RESET; if (mdb_offset == 0) { - mdb_offset = (daddr64_t)((embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize)); + mdb_offset = 
(daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize)); } bp = NULL; - retval = (int)buf_meta_bread(devvp, mdb_offset, blksize, cred, &bp); + retval = (int)buf_meta_bread(devvp, + HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), + phys_blksize, cred, &bp); if (retval == 0) { - jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(blksize)); + jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize)); if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) { printf ("hfs(1): Journal replay fail. Writing lastMountVersion as FSK!\n"); @@ -1170,22 +1202,22 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, * If the backend didn't like our physical blocksize * then retry with physical blocksize of 512. */ - if ((retval == ENXIO) && (blksize > 512) && (blksize != minblksize)) { + if ((retval == ENXIO) && (log_blksize > 512) && (log_blksize != minblksize)) { printf("HFS Mount: could not use physical block size " - "(%d) switching to 512\n", blksize); - blksize = 512; - if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, context)) { + "(%d) switching to 512\n", log_blksize); + log_blksize = 512; + if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) { retval = ENXIO; goto error_exit; } - if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, context)) { + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { retval = ENXIO; goto error_exit; } - devvp->v_specsize = blksize; + devvp->v_specsize = log_blksize; /* Note: relative block count adjustment (in case this is an embedded volume). 
*/ - hfsmp->hfs_phys_block_count *= hfsmp->hfs_phys_block_size / blksize; - hfsmp->hfs_phys_block_size = blksize; + hfsmp->hfs_logical_block_count *= hfsmp->hfs_logical_block_size / log_blksize; + hfsmp->hfs_logical_block_size = log_blksize; if (hfsmp->jnl) { // close and re-open this with the new block size @@ -1203,13 +1235,14 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, hfsmp->hfs_flags |= HFS_NEED_JNL_RESET; if (mdb_offset == 0) { - mdb_offset = (daddr64_t)((embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize)); + mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize)); } bp = NULL; - retval = (int)buf_meta_bread(devvp, mdb_offset, blksize, cred, &bp); + retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), + phys_blksize, cred, &bp); if (retval == 0) { - jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(blksize)); + jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize)); if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) { printf ("hfs(2): Journal replay fail. 
Writing lastMountVersion as FSK!\n"); @@ -1669,16 +1702,18 @@ hfs_sync_metadata(void *arg) struct hfsmount *hfsmp; ExtendedVCB *vcb; buf_t bp; - int sectorsize, retval; + int retval; daddr64_t priIDSector; hfsmp = VFSTOHFS(mp); vcb = HFSTOVCB(hfsmp); // now make sure the super block is flushed - sectorsize = hfsmp->hfs_phys_block_size; - priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / sectorsize) + - HFS_PRI_SECTOR(sectorsize)); - retval = (int)buf_meta_bread(hfsmp->hfs_devvp, priIDSector, sectorsize, NOCRED, &bp); + priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + + HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size)); + + retval = (int)buf_meta_bread(hfsmp->hfs_devvp, + HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, NOCRED, &bp); if ((retval != 0 ) && (retval != ENXIO)) { printf("hfs_sync_metadata: can't read volume header at %d! (retval 0x%x)\n", (int)priIDSector, retval); @@ -1695,7 +1730,9 @@ hfs_sync_metadata(void *arg) // hfs_btreeio.c:FlushAlternate() should flag when it was // written... 
if (hfsmp->hfs_alt_id_sector) { - retval = (int)buf_meta_bread(hfsmp->hfs_devvp, hfsmp->hfs_alt_id_sector, sectorsize, NOCRED, &bp); + retval = (int)buf_meta_bread(hfsmp->hfs_devvp, + HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, NOCRED, &bp); if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) { buf_bwrite(bp); } else if (bp) { @@ -1760,14 +1797,14 @@ hfs_sync(struct mount *mp, int waitfor, vfs_context_t context) int error, allerror = 0; struct hfs_sync_cargs args; + hfsmp = VFSTOHFS(mp); + /* - * During MNT_UPDATE hfs_changefs might be manipulating - * vnodes so back off + * hfs_changefs might be manipulating vnodes so back off */ - if (((u_int32_t)vfs_flags(mp)) & MNT_UPDATE) /* XXX MNT_UPDATE may not be visible here */ + if (hfsmp->hfs_flags & HFS_IN_CHANGEFS) return (0); - hfsmp = VFSTOHFS(mp); if (hfsmp->hfs_flags & HFS_READ_ONLY) return (EROFS); @@ -2118,7 +2155,7 @@ hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, + HFSTOVCB(hfsmp)->hfsPlusIOPosOffset, (off_t)((unsigned)name[3]), hfsmp->hfs_devvp, - hfsmp->hfs_phys_block_size, + hfsmp->hfs_logical_block_size, 0, 0, hfs_sync_metadata, hfsmp->hfs_mp); @@ -2675,7 +2712,7 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) int sectorsize; ByteCount namelen; - sectorsize = hfsmp->hfs_phys_block_size; + sectorsize = hfsmp->hfs_logical_block_size; retval = (int)buf_bread(hfsmp->hfs_devvp, (daddr64_t)HFS_PRI_SECTOR(sectorsize), sectorsize, NOCRED, &bp); if (retval) { if (bp) @@ -2774,7 +2811,6 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) int retval; struct buf *bp; int i; - int sectorsize; daddr64_t priIDSector; int critical; u_int16_t signature; @@ -2787,15 +2823,16 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) return hfs_flushMDB(hfsmp, waitfor, altflush); } critical = altflush; - sectorsize = hfsmp->hfs_phys_block_size; - 
priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / sectorsize) + - HFS_PRI_SECTOR(sectorsize)); + priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + + HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size)); if (hfs_start_transaction(hfsmp) != 0) { return EINVAL; } - retval = (int)buf_meta_bread(hfsmp->hfs_devvp, priIDSector, sectorsize, NOCRED, &bp); + retval = (int)buf_meta_bread(hfsmp->hfs_devvp, + HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, NOCRED, &bp); if (retval) { if (bp) buf_brelse(bp); @@ -2810,7 +2847,8 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) journal_modify_block_start(hfsmp->jnl, bp); } - volumeHeader = (HFSPlusVolumeHeader *)((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(sectorsize)); + volumeHeader = (HFSPlusVolumeHeader *)((char *)buf_dataptr(bp) + + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size)); /* * Sanity check what we just read. @@ -2839,15 +2877,16 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) struct buf *bp2; HFSMasterDirectoryBlock *mdb; - retval = (int)buf_meta_bread(hfsmp->hfs_devvp, (daddr64_t)HFS_PRI_SECTOR(sectorsize), - sectorsize, NOCRED, &bp2); + retval = (int)buf_meta_bread(hfsmp->hfs_devvp, + HFS_PHYSBLK_ROUNDDOWN(HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size), hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, NOCRED, &bp2); if (retval) { if (bp2) buf_brelse(bp2); retval = 0; } else { mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp2) + - HFS_PRI_OFFSET(sectorsize)); + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size)); if ( SWAP_BE32 (mdb->drCrDate) != vcb->localCreateDate ) { @@ -2991,12 +3030,16 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) if (altflush && hfsmp->hfs_alt_id_sector) { struct buf *alt_bp = NULL; - if (buf_meta_bread(hfsmp->hfs_devvp, hfsmp->hfs_alt_id_sector, sectorsize, NOCRED, &alt_bp) == 0) { + if (buf_meta_bread(hfsmp->hfs_devvp, + 
HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, NOCRED, &alt_bp) == 0) { if (hfsmp->jnl) { journal_modify_block_start(hfsmp->jnl, alt_bp); } - bcopy(volumeHeader, (char *)buf_dataptr(alt_bp) + HFS_ALT_OFFSET(sectorsize), kMDBSize); + bcopy(volumeHeader, (char *)buf_dataptr(alt_bp) + + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size), + kMDBSize); if (hfsmp->jnl) { journal_modify_block_end(hfsmp->jnl, alt_bp, NULL, NULL); @@ -3048,6 +3091,7 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) u_int32_t addblks; u_int64_t sectorcnt; u_int32_t sectorsize; + u_int32_t phys_sectorsize; daddr64_t prev_alt_sector; daddr_t bitmapblks; int lockflags; @@ -3093,7 +3137,7 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&sectorsize, 0, context)) { return (ENXIO); } - if (sectorsize != hfsmp->hfs_phys_block_size) { + if (sectorsize != hfsmp->hfs_logical_block_size) { return (ENXIO); } if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&sectorcnt, 0, context)) { @@ -3103,12 +3147,23 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) printf("hfs_extendfs: not enough space on device\n"); return (ENOSPC); } + error = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_sectorsize, 0, context); + if (error) { + if ((error != ENOTSUP) && (error != ENOTTY)) { + return (ENXIO); + } + /* If ioctl is not supported, force physical and logical sector size to be same */ + phys_sectorsize = sectorsize; + } + if (phys_sectorsize != hfsmp->hfs_physical_block_size) { + return (ENXIO); + } oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; /* * Validate new size. 
*/ - if ((newsize <= oldsize) || (newsize % sectorsize)) { + if ((newsize <= oldsize) || (newsize % sectorsize) || (newsize % phys_sectorsize)) { printf("hfs_extendfs: invalid size\n"); return (EINVAL); } @@ -3261,14 +3316,14 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) /* * Adjust file system variables for new space. */ - prev_phys_block_count = hfsmp->hfs_phys_block_count; + prev_phys_block_count = hfsmp->hfs_logical_block_count; prev_alt_sector = hfsmp->hfs_alt_id_sector; vcb->totalBlocks += addblks; vcb->freeBlocks += addblks; - hfsmp->hfs_phys_block_count = newsize / sectorsize; + hfsmp->hfs_logical_block_count = newsize / sectorsize; hfsmp->hfs_alt_id_sector = (hfsmp->hfsPlusIOPosOffset / sectorsize) + - HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count); + HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_logical_block_count); MarkVCBDirty(vcb); error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); if (error) { @@ -3290,7 +3345,7 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) } vcb->totalBlocks -= addblks; vcb->freeBlocks -= addblks; - hfsmp->hfs_phys_block_count = prev_phys_block_count; + hfsmp->hfs_logical_block_count = prev_phys_block_count; hfsmp->hfs_alt_id_sector = prev_alt_sector; MarkVCBDirty(vcb); if (vcb->blockSize == 512) @@ -3304,11 +3359,12 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) */ bp = NULL; if (prev_alt_sector) { - if (buf_meta_bread(hfsmp->hfs_devvp, prev_alt_sector, sectorsize, - NOCRED, &bp) == 0) { + if (buf_meta_bread(hfsmp->hfs_devvp, + HFS_PHYSBLK_ROUNDDOWN(prev_alt_sector, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, NOCRED, &bp) == 0) { journal_modify_block_start(hfsmp->jnl, bp); - bzero((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(sectorsize), kMDBSize); + bzero((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size), kMDBSize); journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL); } else if 
(bp) { @@ -3402,7 +3458,9 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) /* Make sure new size is valid. */ if ((newsize < HFS_MIN_SIZE) || (newsize >= oldsize) || - (newsize % hfsmp->hfs_phys_block_size)) { + (newsize % hfsmp->hfs_logical_block_size) || + (newsize % hfsmp->hfs_physical_block_size)) { + printf ("hfs_truncatefs: invalid size\n"); error = EINVAL; goto out; } @@ -3502,10 +3560,11 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) * since this block will be outside of the truncated file system! */ if (hfsmp->hfs_alt_id_sector) { - if (buf_meta_bread(hfsmp->hfs_devvp, hfsmp->hfs_alt_id_sector, - hfsmp->hfs_phys_block_size, NOCRED, &bp) == 0) { + if (buf_meta_bread(hfsmp->hfs_devvp, + HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, NOCRED, &bp) == 0) { - bzero((void*)((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_phys_block_size)), kMDBSize); + bzero((void*)((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size)), kMDBSize); (void) VNOP_BWRITE(bp); } else if (bp) { buf_brelse(bp); @@ -3521,8 +3580,8 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) * Adjust file system variables and flush them to disk. 
*/ hfsmp->totalBlocks = newblkcnt; - hfsmp->hfs_phys_block_count = newsize / hfsmp->hfs_phys_block_size; - hfsmp->hfs_alt_id_sector = HFS_ALT_SECTOR(hfsmp->hfs_phys_block_size, hfsmp->hfs_phys_block_count); + hfsmp->hfs_logical_block_count = newsize / hfsmp->hfs_logical_block_size; + hfsmp->hfs_alt_id_sector = HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, hfsmp->hfs_logical_block_count); MarkVCBDirty(hfsmp); error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); if (error) @@ -3632,7 +3691,7 @@ hfs_copy_extent( size_t ioSize; u_int32_t ioSizeSectors; /* Device sectors in this I/O */ daddr64_t srcSector, destSector; - u_int32_t sectorsPerBlock = hfsmp->blockSize / hfsmp->hfs_phys_block_size; + u_int32_t sectorsPerBlock = hfsmp->blockSize / hfsmp->hfs_logical_block_size; /* * Sanity check that we have locked the vnode of the file we're copying. @@ -3674,11 +3733,11 @@ hfs_copy_extent( buf_setdataptr(bp, (uintptr_t)buffer); resid = (off_t) blockCount * (off_t) hfsmp->blockSize; - srcSector = (daddr64_t) oldStart * hfsmp->blockSize / hfsmp->hfs_phys_block_size; - destSector = (daddr64_t) newStart * hfsmp->blockSize / hfsmp->hfs_phys_block_size; + srcSector = (daddr64_t) oldStart * hfsmp->blockSize / hfsmp->hfs_logical_block_size; + destSector = (daddr64_t) newStart * hfsmp->blockSize / hfsmp->hfs_logical_block_size; while (resid > 0) { ioSize = MIN(bufferSize, resid); - ioSizeSectors = ioSize / hfsmp->hfs_phys_block_size; + ioSizeSectors = ioSize / hfsmp->hfs_logical_block_size; /* Prepare the buffer for reading */ buf_reset(bp, B_READ); @@ -3988,7 +4047,7 @@ hfs_journal_relocate_callback(void *_args) JournalInfoBlock *jibp; error = buf_meta_bread(hfsmp->hfs_devvp, - hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_phys_block_size), + hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size), hfsmp->blockSize, vfs_context_ucred(args->context), &bp); if (error) { printf("hfs_reclaim_journal_file: failed to read JIB (%d)\n", error); @@ -4144,14 
+4203,14 @@ hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, vfs_context_t context) /* Copy the old journal info block content to the new location */ error = buf_meta_bread(hfsmp->hfs_devvp, - hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_phys_block_size), + hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size), hfsmp->blockSize, vfs_context_ucred(context), &old_bp); if (error) { printf("hfs_reclaim_journal_info_block: failed to read JIB (%d)\n", error); goto free_fail; } new_bp = buf_getblk(hfsmp->hfs_devvp, - newBlock * (hfsmp->blockSize/hfsmp->hfs_phys_block_size), + newBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size), hfsmp->blockSize, 0, 0, BLK_META); bcopy((char*)buf_dataptr(old_bp), (char*)buf_dataptr(new_bp), hfsmp->blockSize); buf_brelse(old_bp); diff --git a/bsd/hfs/hfs_vfsutils.c b/bsd/hfs/hfs_vfsutils.c index 736ab6199..43e5ae8be 100644 --- a/bsd/hfs/hfs_vfsutils.c +++ b/bsd/hfs/hfs_vfsutils.c @@ -146,11 +146,11 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, if (error || (utf8chars == 0)) (void) mac_roman_to_utf8(mdb->drVN, NAME_MAX, &utf8chars, vcb->vcbVN); - hfsmp->hfs_logBlockSize = BestBlockSizeFit(vcb->blockSize, MAXBSIZE, hfsmp->hfs_phys_block_size); + hfsmp->hfs_logBlockSize = BestBlockSizeFit(vcb->blockSize, MAXBSIZE, hfsmp->hfs_logical_block_size); vcb->vcbVBMIOSize = kHFSBlockSize; - hfsmp->hfs_alt_id_sector = HFS_ALT_SECTOR(hfsmp->hfs_phys_block_size, - hfsmp->hfs_phys_block_count); + hfsmp->hfs_alt_id_sector = HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, + hfsmp->hfs_logical_block_count); bzero(&cndesc, sizeof(cndesc)); cndesc.cd_parentcnid = kHFSRootParentID; @@ -330,11 +330,24 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, return (EINVAL); /* Make sure we can live with the physical block size. 
*/ - if ((disksize & (hfsmp->hfs_phys_block_size - 1)) || - (embeddedOffset & (hfsmp->hfs_phys_block_size - 1)) || - (blockSize < hfsmp->hfs_phys_block_size)) { + if ((disksize & (hfsmp->hfs_logical_block_size - 1)) || + (embeddedOffset & (hfsmp->hfs_logical_block_size - 1)) || + (blockSize < hfsmp->hfs_logical_block_size)) { return (ENXIO); } + + /* If allocation block size is less than the physical + * block size, we assume that the physical block size + * is same as logical block size. The physical block + * size value is used to round down the offsets for + * reading and writing the primary and alternate volume + * headers at physical block boundary and will cause + * problems if it is less than the block size. + */ + if (blockSize < hfsmp->hfs_physical_block_size) { + hfsmp->hfs_physical_block_size = hfsmp->hfs_logical_block_size; + } + /* * The VolumeHeader seems OK: transfer info from it into VCB * Note - the VCB starts out clear (all zeros) @@ -378,22 +391,22 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, * (currently set up from the wrapper MDB) using the * new blocksize value: */ - hfsmp->hfs_logBlockSize = BestBlockSizeFit(vcb->blockSize, MAXBSIZE, hfsmp->hfs_phys_block_size); + hfsmp->hfs_logBlockSize = BestBlockSizeFit(vcb->blockSize, MAXBSIZE, hfsmp->hfs_logical_block_size); vcb->vcbVBMIOSize = min(vcb->blockSize, MAXPHYSIO); /* * Validate and initialize the location of the alternate volume header. */ - spare_sectors = hfsmp->hfs_phys_block_count - + spare_sectors = hfsmp->hfs_logical_block_count - (((daddr64_t)vcb->totalBlocks * blockSize) / - hfsmp->hfs_phys_block_size); + hfsmp->hfs_logical_block_size); - if (spare_sectors > (blockSize / hfsmp->hfs_phys_block_size)) { + if (spare_sectors > (daddr64_t)(blockSize / hfsmp->hfs_logical_block_size)) { hfsmp->hfs_alt_id_sector = 0; /* partition has grown! 
*/ } else { - hfsmp->hfs_alt_id_sector = (hfsmp->hfsPlusIOPosOffset / hfsmp->hfs_phys_block_size) + - HFS_ALT_SECTOR(hfsmp->hfs_phys_block_size, - hfsmp->hfs_phys_block_count); + hfsmp->hfs_alt_id_sector = (hfsmp->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + + HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, + hfsmp->hfs_logical_block_count); } bzero(&cndesc, sizeof(cndesc)); @@ -411,6 +424,7 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, cndesc.cd_cnid = cnattr.ca_fileid = kHFSExtentsFileID; cfork.cf_size = SWAP_BE64 (vhp->extentsFile.logicalSize); + cfork.cf_new_size= 0; cfork.cf_clump = SWAP_BE32 (vhp->extentsFile.clumpSize); cfork.cf_blocks = SWAP_BE32 (vhp->extentsFile.totalBlocks); cfork.cf_vblocks = 0; @@ -607,9 +621,11 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, mdb_offset = (daddr64_t)((embeddedOffset / blockSize) + HFS_PRI_SECTOR(blockSize)); - retval = (int)buf_meta_bread(hfsmp->hfs_devvp, mdb_offset, blockSize, cred, &bp); + retval = (int)buf_meta_bread(hfsmp->hfs_devvp, + HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, cred, &bp); if (retval == 0) { - jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(blockSize)); + jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size)); if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) { printf ("hfs(3): Journal replay fail. 
Writing lastMountVersion as FSK!\n"); @@ -1760,7 +1776,8 @@ hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, JournalInfoBlock *jibp; struct buf *jinfo_bp, *bp; int sectors_per_fsblock, arg_flags=0, arg_tbufsz=0; - int retval, blksize = hfsmp->hfs_phys_block_size; + int retval; + uint32_t blksize = hfsmp->hfs_logical_block_size; struct vnode *devvp; struct hfs_mount_args *args = _args; u_int32_t jib_flags; @@ -1808,7 +1825,7 @@ hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, jib_offset + embeddedOffset, jib_size, devvp, - hfsmp->hfs_phys_block_size); + hfsmp->hfs_logical_block_size); hfsmp->jnl = NULL; @@ -1865,14 +1882,16 @@ hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, if (mdb_offset == 0) { mdb_offset = (daddr64_t)((embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize)); } - retval = (int)buf_meta_bread(devvp, mdb_offset, blksize, cred, &bp); + retval = (int)buf_meta_bread(devvp, + HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, cred, &bp); if (retval) { buf_brelse(bp); printf("hfs: failed to reload the mdb after opening the journal (retval %d)!\n", retval); return retval; } - bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(blksize), mdbp, 512); + bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size), mdbp, 512); buf_brelse(bp); bp = NULL; } @@ -1955,9 +1974,9 @@ hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_a } - sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / hfsmp->hfs_phys_block_size; + sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / hfsmp->hfs_logical_block_size; retval = (int)buf_meta_bread(devvp, - (daddr64_t)(vcb->hfsPlusIOPosOffset / hfsmp->hfs_phys_block_size + + (daddr64_t)(vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size + (SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock)), SWAP_BE32(vhp->blockSize), NOCRED, &jinfo_bp); if (retval) { @@ -2021,7 +2040,7 @@ 
hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_a jib_offset + (off_t)vcb->hfsPlusIOPosOffset, jib_size, devvp, - hfsmp->hfs_phys_block_size); + hfsmp->hfs_logical_block_size); hfsmp->jnl = NULL; @@ -2042,7 +2061,7 @@ hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_a jib_offset + (off_t)vcb->hfsPlusIOPosOffset, jib_size, devvp, - hfsmp->hfs_phys_block_size, + hfsmp->hfs_logical_block_size, arg_flags, arg_tbufsz, hfs_sync_metadata, hfsmp->hfs_mp); @@ -2071,7 +2090,7 @@ hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_a jib_offset + (off_t)vcb->hfsPlusIOPosOffset, jib_size, devvp, - hfsmp->hfs_phys_block_size, + hfsmp->hfs_logical_block_size, arg_flags, arg_tbufsz, hfs_sync_metadata, hfsmp->hfs_mp); diff --git a/bsd/hfs/hfs_vnops.c b/bsd/hfs/hfs_vnops.c index eef6b5e96..61875f626 100644 --- a/bsd/hfs/hfs_vnops.c +++ b/bsd/hfs/hfs_vnops.c @@ -2304,8 +2304,32 @@ hfs_vnop_rename(ap) error = hfs_lockfour(VTOC(fdvp), VTOC(fvp), VTOC(tdvp), tvp ? VTOC(tvp) : NULL, HFS_EXCLUSIVE_LOCK); if (error) { - if (took_trunc_lock) + if (took_trunc_lock) { hfs_unlock_truncate(VTOC(tvp), TRUE); + took_trunc_lock = 0; + } + /* + * tvp might no longer exist. if we get ENOENT, re-check the + * C_NOEXISTS flag on tvp to find out whether it's still in the + * namespace. + */ + if (error == ENOENT && tvp) { + /* + * It's okay to just check C_NOEXISTS without having a lock, + * because we have an iocount on it from the vfs layer so it can't + * have disappeared. + */ + if (VTOC(tvp)->c_flag & C_NOEXISTS) { + /* + * tvp is no longer in the namespace. Try again with NULL + * tvp/tcp (NULLing these out is fine because the vfs syscall + * will vnode_put the vnodes). 
+ */ + tcp = NULL; + tvp = NULL; + goto retry; + } + } return (error); } @@ -2815,7 +2839,7 @@ hfs_vnop_symlink(struct vnop_symlink_args *ap) } /* Write the link to disk */ - bp = buf_getblk(vp, (daddr64_t)0, roundup((int)fp->ff_size, VTOHFS(vp)->hfs_phys_block_size), + bp = buf_getblk(vp, (daddr64_t)0, roundup((int)fp->ff_size, hfsmp->hfs_physical_block_size), 0, 0, BLK_META); if (hfsmp->jnl) { journal_modify_block_start(hfsmp->jnl, bp); @@ -3185,8 +3209,7 @@ hfs_vnop_readlink(ap) MALLOC(fp->ff_symlinkptr, char *, fp->ff_size, M_TEMP, M_WAITOK); error = (int)buf_meta_bread(vp, (daddr64_t)0, - roundup((int)fp->ff_size, - VTOHFS(vp)->hfs_phys_block_size), + roundup((int)fp->ff_size, VTOHFS(vp)->hfs_physical_block_size), vfs_context_ucred(ap->a_context), &bp); if (error) { if (bp) diff --git a/bsd/hfs/hfscommon/BTree/BTree.c b/bsd/hfs/hfscommon/BTree/BTree.c index 86307fbcb..70108d229 100644 --- a/bsd/hfs/hfscommon/BTree/BTree.c +++ b/bsd/hfs/hfscommon/BTree/BTree.c @@ -230,8 +230,8 @@ OSStatus BTOpenPath(FCB *filePtr, KeyCompareProcPtr keyCompareProc) btreePtr->fileRefNum = GetFileRefNumFromFCB(filePtr); filePtr->fcbBTCBPtr = (Ptr) btreePtr; // attach btree cb to file - /* The minimum node size is the physical block size */ - nodeRec.blockSize = VTOHFS(btreePtr->fileRefNum)->hfs_phys_block_size; + /* Prefer doing I/O a physical block at a time */ + nodeRec.blockSize = VTOHFS(btreePtr->fileRefNum)->hfs_physical_block_size; /* Start with the allocation block size for regular files. 
*/ if (FTOC(filePtr)->c_fileid >= kHFSFirstUserCatalogNodeID) @@ -301,8 +301,8 @@ OSStatus BTOpenPath(FCB *filePtr, KeyCompareProcPtr keyCompareProc) // set kBadClose attribute bit, and UpdateNode - /* b-tree node size must be at least as big as the physical block size */ - if (btreePtr->nodeSize < nodeRec.blockSize) + /* b-tree node size must be at least as big as the logical block size */ + if (btreePtr->nodeSize < VTOHFS(btreePtr->fileRefNum)->hfs_logical_block_size) { /* * If this tree has any records or the media is writeable then diff --git a/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c b/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c index 97e308497..9ac5c926f 100644 --- a/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c +++ b/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c @@ -584,11 +584,22 @@ static OSErr InsertNode (BTreeControlBlockPtr btreePtr, /////////////////////// Try Simple Insert /////////////////////////////// - if ( node == leftNodeNum ) - targetNode = leftNode; - else - targetNode = rightNode; - + /* sanity check our left and right nodes here. */ + if (node == leftNodeNum) { + if (leftNode->buffer == NULL) { + err = fsBTInvalidNodeErr; + M_ExitOnError(err); + } + else{ + targetNode = leftNode; + } + } + else { + // we can assume right node is initialized. 
+ targetNode = rightNode; + } + + recordFit = InsertKeyRecord (btreePtr, targetNode->buffer, index, key->keyPtr, key->keyLength, key->recPtr, key->recSize); if ( recordFit ) @@ -605,7 +616,7 @@ static OSErr InsertNode (BTreeControlBlockPtr btreePtr, if ( !recordFit && leftNodeNum > 0 ) { - PanicIf ( leftNode->buffer != nil, "\p InsertNode: leftNode already aquired!"); + PanicIf ( leftNode->buffer != nil, "\p InsertNode: leftNode already acquired!"); if ( leftNode->buffer == nil ) { diff --git a/bsd/hfs/hfscommon/Misc/FileExtentMapping.c b/bsd/hfs/hfscommon/Misc/FileExtentMapping.c index a732366ca..718a87bdc 100644 --- a/bsd/hfs/hfscommon/Misc/FileExtentMapping.c +++ b/bsd/hfs/hfscommon/Misc/FileExtentMapping.c @@ -487,7 +487,7 @@ OSErr MapFileBlockC ( off_t tmpOff; allocBlockSize = vcb->blockSize; - sectorSize = VCBTOHFS(vcb)->hfs_phys_block_size; + sectorSize = VCBTOHFS(vcb)->hfs_logical_block_size; err = SearchExtentFile(vcb, fcb, offset, &foundKey, foundData, &foundIndex, &hint, &nextFABN); if (err == noErr) { diff --git a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c index 0a999100e..be4d28c5e 100644 --- a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c +++ b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c @@ -86,6 +86,7 @@ Internal routines: #include #include #include +#include #include "../../hfs.h" #include "../../hfs_dbg.h" @@ -1177,6 +1178,7 @@ OSErr BlockMarkFree( u_int32_t wordsPerBlock; // XXXdbg struct hfsmount *hfsmp = VCBTOHFS(vcb); + dk_discard_t discard; /* * NOTE: We use vcb->totalBlocks instead of vcb->allocLimit because we @@ -1189,6 +1191,10 @@ OSErr BlockMarkFree( goto Exit; } + memset(&discard, 0, sizeof(dk_discard_t)); + discard.offset = (uint64_t)startingBlock * (uint64_t)vcb->blockSize; + discard.length = (uint64_t)numBlocks * (uint64_t)vcb->blockSize; + // // Pre-read the bitmap block containing the first word of allocation @@ -1313,6 +1319,12 @@ OSErr BlockMarkFree( if (buffer) (void)ReleaseBitmapBlock(vcb, 
blockRef, true); + if (err == noErr) { + // it doesn't matter if this fails, it's just informational anyway + VNOP_IOCTL(vcb->hfs_devvp, DKIOCDISCARD, (caddr_t)&discard, 0, vfs_context_kernel()); + } + + return err; Corruption: diff --git a/bsd/kern/bsd_init.c b/bsd/kern/bsd_init.c index 50d955a3f..cf4ee656a 100644 --- a/bsd/kern/bsd_init.c +++ b/bsd/kern/bsd_init.c @@ -1015,29 +1015,29 @@ parse_bsd_args(void) char namep[16]; int msgbuf; - if (PE_parse_boot_arg("-s", namep)) + if (PE_parse_boot_argn("-s", namep, sizeof (namep))) boothowto |= RB_SINGLE; - if (PE_parse_boot_arg("-b", namep)) + if (PE_parse_boot_argn("-b", namep, sizeof (namep))) boothowto |= RB_NOBOOTRC; - if (PE_parse_boot_arg("-x", namep)) /* safe boot */ + if (PE_parse_boot_argn("-x", namep, sizeof (namep))) /* safe boot */ boothowto |= RB_SAFEBOOT; - if (PE_parse_boot_arg("-l", namep)) /* leaks logging */ + if (PE_parse_boot_argn("-l", namep, sizeof (namep))) /* leaks logging */ turn_on_log_leaks = 1; - PE_parse_boot_arg("srv", &srv); - PE_parse_boot_arg("ncl", &ncl); - if (PE_parse_boot_arg("nbuf", &max_nbuf_headers)) { + PE_parse_boot_argn("srv", &srv, sizeof (srv)); + PE_parse_boot_argn("ncl", &ncl, sizeof (ncl)); + if (PE_parse_boot_argn("nbuf", &max_nbuf_headers, sizeof (max_nbuf_headers))) { customnbuf = 1; } #if !defined(SECURE_KERNEL) - PE_parse_boot_arg("kmem", &setup_kmem); + PE_parse_boot_argn("kmem", &setup_kmem, sizeof (setup_kmem)); #endif - PE_parse_boot_arg("trace", &new_nkdbufs); + PE_parse_boot_argn("trace", &new_nkdbufs, sizeof (new_nkdbufs)); - if (PE_parse_boot_arg("msgbuf", &msgbuf)) { + if (PE_parse_boot_argn("msgbuf", &msgbuf, sizeof (msgbuf))) { log_setsize(msgbuf); } } diff --git a/bsd/kern/bsd_stubs.c b/bsd/kern/bsd_stubs.c index 657d9fde6..37a5afb34 100644 --- a/bsd/kern/bsd_stubs.c +++ b/bsd/kern/bsd_stubs.c @@ -311,7 +311,7 @@ issingleuser(void) { char namep[16]; - if (PE_parse_boot_arg("-s", namep)) { + if (PE_parse_boot_argn("-s", namep, sizeof(namep))) { 
return(1); } else { return(0); diff --git a/bsd/kern/init_sysent.c b/bsd/kern/init_sysent.c index 1f421780e..f736a0c8a 100644 --- a/bsd/kern/init_sysent.c +++ b/bsd/kern/init_sysent.c @@ -520,15 +520,26 @@ __private_extern__ struct sysent sysent[] = { {0, 0, 0, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T, 0}, /* 358 = nosys */ {0, 0, 0, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T, 0}, /* 359 = nosys */ #endif +#if CONFIG_WORKQUEUE {AC(bsdthread_create_args), 0, 0, (sy_call_t *)bsdthread_create, munge_wwwww, munge_ddddd, _SYSCALL_RET_ADDR_T, 20}, /* 360 = bsdthread_create */ {AC(bsdthread_terminate_args), 0, 0, (sy_call_t *)bsdthread_terminate, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T, 16}, /* 361 = bsdthread_terminate */ +#else + {0, 0, 0, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T, 0}, /* 360 = nosys */ + {0, 0, 0, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T, 0}, /* 361 = nosys */ +#endif {0, 0, 0, (sy_call_t *)kqueue, NULL, NULL, _SYSCALL_RET_INT_T, 0}, /* 362 = kqueue */ {AC(kevent_args), 0, 0, (sy_call_t *)kevent, munge_wwwwww, munge_dddddd, _SYSCALL_RET_INT_T, 24}, /* 363 = kevent */ {AC(lchown_args), 0, 0, (sy_call_t *)lchown, munge_www, munge_ddd, _SYSCALL_RET_INT_T, 12}, /* 364 = lchown */ {AC(stack_snapshot_args), 0, 0, (sy_call_t *)stack_snapshot, munge_wwww, munge_dddd, _SYSCALL_RET_INT_T, 16}, /* 365 = stack_snapshot */ +#if CONFIG_WORKQUEUE {AC(bsdthread_register_args), 0, 0, (sy_call_t *)bsdthread_register, munge_www, munge_ddd, _SYSCALL_RET_INT_T, 12}, /* 366 = bsdthread_register */ {0, 0, 0, (sy_call_t *)workq_open, NULL, NULL, _SYSCALL_RET_INT_T, 0}, /* 367 = workq_open */ {AC(workq_ops_args), 0, 0, (sy_call_t *)workq_ops, munge_www, munge_ddd, _SYSCALL_RET_INT_T, 12}, /* 368 = workq_ops */ +#else + {0, 0, 0, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T, 0}, /* 366 = nosys */ + {0, 0, 0, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T, 0}, /* 367 = nosys */ + {0, 0, 0, (sy_call_t *)nosys, NULL, NULL, 
_SYSCALL_RET_INT_T, 0}, /* 368 = nosys */ +#endif {0, 0, 0, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T, 0}, /* 369 = nosys */ {0, 0, 0, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T, 0}, /* 370 = nosys */ {0, 0, 0, (sy_call_t *)nosys, NULL, NULL, _SYSCALL_RET_INT_T, 0}, /* 371 = nosys */ diff --git a/bsd/kern/kdebug.c b/bsd/kern/kdebug.c index 952d2b87c..737743535 100644 --- a/bsd/kern/kdebug.c +++ b/bsd/kern/kdebug.c @@ -402,7 +402,7 @@ kdbg_lock_init(void) /* get the number of cpus and cache it */ #define BSD_HOST 1 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count); - kd_cpus = hinfo.physical_cpu_max; + kd_cpus = hinfo.logical_cpu_max; if (kmem_alloc(kernel_map, (unsigned int *)&kdbip, sizeof(struct kd_bufinfo) * kd_cpus) != KERN_SUCCESS) diff --git a/bsd/kern/kern_core.c b/bsd/kern/kern_core.c index fcd359b61..70fb531b3 100644 --- a/bsd/kern/kern_core.c +++ b/bsd/kern/kern_core.c @@ -114,7 +114,11 @@ kern_return_t thread_getstatus(register thread_t act, int flavor, void task_act_iterate_wth_args(task_t, void(*)(thread_t, void *), void *); +#ifdef SECURE_KERNEL +__private_extern__ int do_coredump = 0; /* default: don't dump cores */ +#else __private_extern__ int do_coredump = 1; /* default: dump cores */ +#endif __private_extern__ int sugid_coredump = 0; /* default: but not SGUID binaries */ void diff --git a/bsd/kern/kern_descrip.c b/bsd/kern/kern_descrip.c index 0a67834ff..d7711096f 100644 --- a/bsd/kern/kern_descrip.c +++ b/bsd/kern/kern_descrip.c @@ -104,7 +104,7 @@ #include #include -#include +#include struct psemnode; struct pshmnode; @@ -142,6 +142,8 @@ extern int soo_stat(struct socket *so, void *ub, int isstat64); extern kauth_scope_t kauth_scope_fileop; +extern int cs_debug; + #define f_flag f_fglob->fg_flag #define f_type f_fglob->fg_type #define f_msgcount f_fglob->fg_msgcount @@ -1370,6 +1372,14 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, register_t *retval) goto outdrop; } + 
if(ubc_cs_blob_get(vp, CPU_TYPE_ANY, fs.fs_file_start)) + { + if(cs_debug) + printf("CODE SIGNING: resident blob offered for: %s\n", vp->v_name); + vnode_put(vp); + goto outdrop; + } + #define CS_MAX_BLOB_SIZE (1ULL * 1024 * 1024) /* XXX ? */ if (fs.fs_blob_size > CS_MAX_BLOB_SIZE) { error = E2BIG; @@ -1378,9 +1388,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, register_t *retval) } kernel_blob_size = CAST_DOWN(vm_size_t, fs.fs_blob_size); - kr = kmem_alloc(kernel_map, - &kernel_blob_addr, - kernel_blob_size); + kr = ubc_cs_blob_allocate(&kernel_blob_addr, &kernel_blob_size); if (kr != KERN_SUCCESS) { error = ENOMEM; vnode_put(vp); @@ -1391,9 +1399,8 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, register_t *retval) (void *) kernel_blob_addr, kernel_blob_size); if (error) { - kmem_free(kernel_map, - kernel_blob_addr, - kernel_blob_size); + ubc_cs_blob_deallocate(kernel_blob_addr, + kernel_blob_size); vnode_put(vp); goto outdrop; } @@ -1405,11 +1412,10 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, register_t *retval) kernel_blob_addr, kernel_blob_size); if (error) { - kmem_free(kernel_map, - kernel_blob_addr, - kernel_blob_size); + ubc_cs_blob_deallocate(kernel_blob_addr, + kernel_blob_size); } else { - /* ubc_blob_add() was consumed "kernel_blob_addr" */ + /* ubc_blob_add() has consumed "kernel_blob_addr" */ } (void) vnode_put(vp); diff --git a/bsd/kern/kern_exec.c b/bsd/kern/kern_exec.c index 6b2702d7b..43ab48894 100644 --- a/bsd/kern/kern_exec.c +++ b/bsd/kern/kern_exec.c @@ -893,9 +893,6 @@ exec_mach_imgact(struct image_params *imgp) imgp->ip_csflags |= CS_KILL; - /* load_machfile() maps the vnode */ - (void)ubc_map(imgp->ip_vp, PROT_READ | PROT_EXEC); - /* * Set up the system reserved areas in the new address space. 
*/ @@ -919,8 +916,6 @@ exec_mach_imgact(struct image_params *imgp) */ error = exec_handle_sugid(imgp); - proc_knote(p, NOTE_EXEC); - if (!vfexec && (p->p_lflag & P_LTRACED)) psignal(p, SIGTRAP); @@ -928,6 +923,13 @@ exec_mach_imgact(struct image_params *imgp) goto badtoolate; } +#if CONFIG_MACF + /* Determine if the map will allow VM_PROT_COPY */ + error = mac_proc_check_map_prot_copy_allow(p); + vm_map_set_prot_copy_allow(get_task_map(task), + error ? FALSE : TRUE); +#endif + if (load_result.unixproc && create_unix_stack(get_task_map(task), load_result.user_stack, @@ -1127,6 +1129,8 @@ exec_mach_imgact(struct image_params *imgp) } badtoolate: + proc_knote(p, NOTE_EXEC); + if (vfexec) { task_deallocate(new_task); thread_deallocate(thread); @@ -1196,6 +1200,7 @@ exec_activate_image(struct image_params *imgp) int once = 1; /* save SGUID-ness for interpreted files */ int i; int iterlimit = EAI_ITERLIMIT; + proc_t p = vfs_context_proc(imgp->ip_vfs_context); error = execargs_alloc(imgp); if (error) @@ -1209,7 +1214,7 @@ exec_activate_image(struct image_params *imgp) */ error = exec_save_path(imgp, imgp->ip_user_fname, imgp->ip_seg); if (error) { - goto bad; + goto bad_notrans; } DTRACE_PROC1(exec, uintptr_t, imgp->ip_strings); @@ -1220,10 +1225,12 @@ exec_activate_image(struct image_params *imgp) again: error = namei(&nd); if (error) - goto bad; + goto bad_notrans; imgp->ip_ndp = &nd; /* successful namei(); call nameidone() later */ imgp->ip_vp = nd.ni_vp; /* if set, need to vnode_put() at some point */ + proc_transstart(p, 0); + error = exec_check_permissions(imgp); if (error) goto bad; @@ -1292,6 +1299,7 @@ exec_activate_image(struct image_params *imgp) nd.ni_segflg = UIO_SYSSPACE32; nd.ni_dirp = CAST_USER_ADDR_T(imgp->ip_interp_name); + proc_transend(p, 0); goto again; default: @@ -1310,6 +1318,9 @@ exec_activate_image(struct image_params *imgp) } bad: + proc_transend(p, 0); + +bad_notrans: if (imgp->ip_strings) execargs_free(imgp); if (imgp->ip_ndp) @@ -1949,7 
+1960,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, register_t *retval) if (!(uthread->uu_flag & UT_VFORK)) { if (task != kernel_task) { proc_lock(p); - numthreads = get_task_numacts(task); + numthreads = get_task_numactivethreads(task); if (numthreads <= 0 ) { proc_unlock(p); kauth_cred_unref(&context.vc_ucred); @@ -1974,9 +1985,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, register_t *retval) } #endif - proc_transstart(p, 0); error = exec_activate_image(imgp); - proc_transend(p, 0); kauth_cred_unref(&context.vc_ucred); @@ -2711,7 +2720,8 @@ exec_handle_sugid(struct image_params *imgp) */ p->p_ucred = kauth_cred_setsvuidgid(p->p_ucred, kauth_cred_getuid(p->p_ucred), p->p_ucred->cr_gid); - /* XXX Obsolete; security token should not be separate from cred */ + /* Update the process' identity version and set the security token */ + p->p_idversion++; set_security_token(p); return(error); diff --git a/bsd/kern/kern_exit.c b/bsd/kern/kern_exit.c index 27f98defb..a8e9ef7f0 100644 --- a/bsd/kern/kern_exit.c +++ b/bsd/kern/kern_exit.c @@ -418,6 +418,17 @@ proc_exit(proc_t p) */ fdfree(p); + if (uth->uu_lowpri_window) { + /* + * task is marked as a low priority I/O type + * and the I/O we issued while flushing files on close + * collided with normal I/O operations... + * no need to throttle this thread since its going away + * but we do need to update our bookeeping w/r to throttled threads + */ + throttle_lowpri_io(FALSE); + } + #if SYSV_SHM /* Close ref SYSV Shared memory*/ if (p->vm_shm) @@ -777,6 +788,15 @@ proc_exit(proc_t p) (void)reap_child_locked(pp, p, 1, 1, 1); /* list lock dropped by reap_child_locked */ } + if (uth->uu_lowpri_window) { + /* + * task is marked as a low priority I/O type and we've + * somehow picked up another throttle during exit processing... 
+ * no need to throttle this thread since it's going away + * but we do need to update our bookkeeping w/r to throttled threads + */ + throttle_lowpri_io(FALSE); + } proc_rele(pp); diff --git a/bsd/kern/kern_fork.c b/bsd/kern/kern_fork.c index fa43edd2e..3bc45c1ce 100644 --- a/bsd/kern/kern_fork.c +++ b/bsd/kern/kern_fork.c @@ -850,7 +850,7 @@ proc_t forkproc(proc_t parent, int lock) { struct proc * child; /* Our new process */ - static int nextpid = 0, pidwrap = 0; + static int nextpid = 0, pidwrap = 0, nextpidversion = 0; int error = 0; struct session *sessp; uthread_t uth_parent = (uthread_t)get_bsdthread_info(current_thread()); @@ -926,6 +926,7 @@ forkproc(proc_t parent, int lock) } nprocs++; child->p_pid = nextpid; + child->p_idversion = nextpidversion++; #if 1 if (child->p_pid != 0) { if (pfind_locked(child->p_pid) != PROC_NULL) @@ -1183,6 +1184,17 @@ uthread_cleanup(task_t task, void *uthread, void * bsd_info) uthread_t uth = (uthread_t)uthread; proc_t p = (proc_t)bsd_info; + + if (uth->uu_lowpri_window) { + /* + * task is marked as a low priority I/O type + * and we've somehow managed to not dismiss the throttle + * through the normal exit paths back to user space... + * no need to throttle this thread since it's going away + * but we do need to update our bookkeeping w/r to throttled threads + */ + throttle_lowpri_io(FALSE); + } /* * Per-thread audit state should never last beyond system * call return. 
Since we don't audit the thread creation/ diff --git a/bsd/kern/kern_memorystatus.c b/bsd/kern/kern_memorystatus.c index 78380edcf..912fdef3f 100644 --- a/bsd/kern/kern_memorystatus.c +++ b/bsd/kern/kern_memorystatus.c @@ -61,10 +61,15 @@ #include #include +extern unsigned int vm_page_free_count; +extern unsigned int vm_page_active_count; +extern unsigned int vm_page_inactive_count; +extern unsigned int vm_page_purgeable_count; +extern unsigned int vm_page_wire_count; + static void kern_memorystatus_thread(void); int kern_memorystatus_wakeup = 0; -int kern_memorystatus_pause = 0; int kern_memorystatus_level = 0; int kern_memorystatus_last_level = 0; unsigned int kern_memorystatus_kev_failure_count = 0; @@ -82,6 +87,13 @@ static void kern_memorystatus_thread(void) { struct kev_msg ev_msg; + struct { + uint32_t free_pages; + uint32_t active_pages; + uint32_t inactive_pages; + uint32_t purgeable_pages; + uint32_t wired_pages; + } data; int ret; while(1) { @@ -95,7 +107,15 @@ kern_memorystatus_thread(void) /* pass the memory status level in the event code (as percent used) */ ev_msg.event_code = 100 - kern_memorystatus_last_level; - ev_msg.dv[0].data_length = 0; + ev_msg.dv[0].data_length = sizeof data; + ev_msg.dv[0].data_ptr = &data; + ev_msg.dv[1].data_length = 0; + + data.free_pages = vm_page_free_count; + data.active_pages = vm_page_active_count; + data.inactive_pages = vm_page_inactive_count; + data.purgeable_pages = vm_page_purgeable_count; + data.wired_pages = vm_page_wire_count; ret = kev_post_msg(&ev_msg); if (ret) { @@ -103,9 +123,6 @@ kern_memorystatus_thread(void) printf("%s: kev_post_msg() failed, err %d\n", __func__, ret); } - assert_wait_timeout((event_t)&kern_memorystatus_pause, THREAD_UNINT, 1, 250*1000*NSEC_PER_USEC); - (void)thread_block(THREAD_CONTINUE_NULL); - if (kern_memorystatus_level >= kern_memorystatus_last_level + 5 || kern_memorystatus_level <= kern_memorystatus_last_level - 5) continue; diff --git a/bsd/kern/kern_mib.c 
b/bsd/kern/kern_mib.c index 0b8f8f024..01ac7e637 100644 --- a/bsd/kern/kern_mib.c +++ b/bsd/kern/kern_mib.c @@ -592,7 +592,7 @@ sysctl_mib_init(void) if (cpusubtype == CPU_SUBTYPE_POWERPC_970 && cpu_info.l2_cache_size == 1 * 1024 * 1024) /* The signature of the dual-core G5 */ - packages = hinfo.max_cpus / 2; + packages = roundup(hinfo.max_cpus, 2) / 2; else packages = hinfo.max_cpus; @@ -647,9 +647,9 @@ sysctl_mib_init(void) cachesize[4] = 0; /* hw.packages */ - packages = ml_cpu_cache_sharing(0) / - cpuid_info()->cpuid_cores_per_package; - + packages = roundup(ml_cpu_cache_sharing(0), cpuid_info()->thread_count) + / cpuid_info()->thread_count; + #else /* end __arm__ */ # warning we do not support this platform yet #endif /* __ppc__ */ diff --git a/bsd/kern/kern_mman.c b/bsd/kern/kern_mman.c index 499d6f61b..9fa5bd2e0 100644 --- a/bsd/kern/kern_mman.c +++ b/bsd/kern/kern_mman.c @@ -191,7 +191,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) struct fileproc *fp; register struct vnode *vp; int flags; - int prot, file_prot; + int prot; int err=0; vm_map_t user_map; kern_return_t result; @@ -565,13 +565,6 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) (void)vnode_put(vp); goto out; } - - file_prot = prot & (PROT_READ | PROT_WRITE | PROT_EXEC); - if (docow) { - /* private mapping: won't write to the file */ - file_prot &= ~PROT_WRITE; - } - (void) ubc_map(vp, file_prot); } if (!mapanon) @@ -1231,7 +1224,6 @@ map_fd_funneled( } ubc_setthreadcred(vp, current_proc(), current_thread()); - (void)ubc_map(vp, (PROT_READ | PROT_EXEC)); (void)vnode_put(vp); err = 0; bad: diff --git a/bsd/kern/kern_proc.c b/bsd/kern/kern_proc.c index a3c81e332..9dd8f6ad1 100644 --- a/bsd/kern/kern_proc.c +++ b/bsd/kern/kern_proc.c @@ -155,7 +155,9 @@ lck_attr_t * lctx_lck_attr; static void lctxinit(void); #endif +#if DEBUG #define __PROC_INTERNAL_DEBUG 1 +#endif /* Name to give to core files */ __private_extern__ char corefilename[MAXPATHLEN+1] = {"/cores/core.%P"}; 
@@ -284,16 +286,22 @@ inferior(proc_t p) int isinferior(proc_t p, proc_t t) { -int retval = 0; + int retval = 0; + int nchecked = 0; + proc_t start = p; /* if p==t they are not inferior */ if (p == t) return(0); proc_list_lock(); - for (; p != t; p = p->p_pptr) - if (p->p_pid == 0) + for (; p != t; p = p->p_pptr) { + nchecked++; + + /* Detect here if we're in a cycle */ + if ((p->p_pid == 0) || (p->p_pptr == start) || (nchecked >= nprocs)) goto out; + } retval = 1; out: proc_list_unlock(); @@ -548,9 +556,9 @@ proc_childdrainend(proc_t p) } void -proc_checkdeadrefs(proc_t p) +proc_checkdeadrefs(__unused proc_t p) { -//#if __PROC_INTERNAL_DEBUG +#if __PROC_INTERNAL_DEBUG if ((p->p_listflag & P_LIST_INHASH) != 0) panic("proc being freed and still in hash %x: %x\n", (unsigned int)p, (unsigned int)p->p_listflag); if (p->p_childrencnt != 0) @@ -559,7 +567,7 @@ proc_checkdeadrefs(proc_t p) panic("proc being freed and pending refcount %x:%x\n", (unsigned int)p, (unsigned int)p->p_refcount); if (p->p_parentref != 0) panic("proc being freed and pending parentrefs %x:%x\n", (unsigned int)p, (unsigned int)p->p_parentref); -//#endif +#endif } int @@ -755,6 +763,18 @@ proc_is64bit(proc_t p) return(IS_64BIT_PROCESS(p)); } +int +proc_pidversion(proc_t p) +{ + return(p->p_idversion); +} + +int +proc_getcdhash(proc_t p, unsigned char *cdhash) +{ + return vn_getcdhash(p->p_textvp, p->p_textoff, cdhash); +} + void bsd_set_dependency_capable(task_t task) { @@ -1705,7 +1725,6 @@ csops(__unused proc_t p, struct csops_args *uap, __unused register_t *retval) buf = (char *)kalloc(usize); if (buf == NULL) return(ENOMEM); - bzero(buf, usize); error = vnode_getwithvid(tvp, vid); @@ -2456,7 +2475,8 @@ SYSCTL_INT(_vm, OID_AUTO, cs_force_hard, CTLFLAG_RW, &cs_force_hard, 0, ""); SYSCTL_INT(_vm, OID_AUTO, cs_debug, CTLFLAG_RW, &cs_debug, 0, ""); int -cs_invalid_page(void) +cs_invalid_page( + addr64_t vaddr) { struct proc *p; int retval; @@ -2475,48 +2495,41 @@ cs_invalid_page(void) if 
(cs_force_hard) p->p_csflags |= CS_HARD; - if (p->p_csflags & CS_VALID) { - p->p_csflags &= ~CS_VALID; - + /* CS_KILL triggers us to send a kill signal. Nothing else. */ + if (p->p_csflags & CS_KILL) { proc_unlock(p); - cs_procs_invalidated++; - printf("CODE SIGNING: cs_invalid_page: " - "p=%d[%s] clearing CS_VALID\n", - p->p_pid, p->p_comm); + if (cs_debug) { + printf("CODE SIGNING: cs_invalid_page(0x%llx): " + "p=%d[%s] honoring CS_KILL\n", + vaddr, p->p_pid, p->p_comm); + } + cs_procs_killed++; + psignal(p, SIGKILL); proc_lock(p); - - - if (p->p_csflags & CS_KILL) { - proc_unlock(p); - if (cs_debug) { - printf("CODE SIGNING: cs_invalid_page: " - "p=%d[%s] honoring CS_KILL\n", - p->p_pid, p->p_comm); - } - cs_procs_killed++; - psignal(p, SIGKILL); - proc_lock(p); + } + + /* CS_HARD means fail the mapping operation so the process stays valid. */ + if (p->p_csflags & CS_HARD) { + proc_unlock(p); + if (cs_debug) { + printf("CODE SIGNING: cs_invalid_page(0x%llx): " + "p=%d[%s] honoring CS_HARD\n", + vaddr, p->p_pid, p->p_comm); } - - if (p->p_csflags & CS_HARD) { + retval = 1; + } else { + if (p->p_csflags & CS_VALID) { + p->p_csflags &= ~CS_VALID; + proc_unlock(p); - if (cs_debug) { - printf("CODE SIGNING: cs_invalid_page: " - "p=%d[%s] honoring CS_HARD\n", - p->p_pid, p->p_comm); - } - retval = 1; + cs_procs_invalidated++; + printf("CODE SIGNING: cs_invalid_page(0x%llx): " + "p=%d[%s] clearing CS_VALID\n", + vaddr, p->p_pid, p->p_comm); } else { proc_unlock(p); - retval = 0; - } - } else { - proc_unlock(p); - if (cs_debug) { - printf("CODE SIGNING: cs_invalid_page: " - "p=%d[%s] ignored...\n", - p->p_pid, p->p_comm); } + retval = 0; } diff --git a/bsd/kern/kern_prot.c b/bsd/kern/kern_prot.c index 47c7983ca..46ab8bf1b 100644 --- a/bsd/kern/kern_prot.c +++ b/bsd/kern/kern_prot.c @@ -1986,7 +1986,7 @@ set_security_token(proc_t p) audit_token.val[4] = my_cred->cr_rgid; audit_token.val[5] = p->p_pid; audit_token.val[6] = my_cred->cr_au.ai_asid; - audit_token.val[7] = 
my_cred->cr_au.ai_termid.port; + audit_token.val[7] = p->p_idversion; #if CONFIG_MACF_MACH mac_task_label_update_cred(my_cred, p->task); diff --git a/bsd/kern/kern_sig.c b/bsd/kern/kern_sig.c index 105ece427..b9b14ffb5 100644 --- a/bsd/kern/kern_sig.c +++ b/bsd/kern/kern_sig.c @@ -2709,7 +2709,7 @@ bsd_ast(thread_t thread) } -/* ptrace set runnalbe */ +/* ptrace set runnable */ void pt_setrunnable(proc_t p) { @@ -2723,7 +2723,9 @@ pt_setrunnable(proc_t p) proc_unlock(p); if (p->sigwait) { wakeup((caddr_t)&(p->sigwait)); - task_release(task); + if ((p->p_lflag & P_LSIGEXC) == 0) { // 5878479 + task_release(task); + } } } } diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index 82f89c2de..b8673b4bc 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -607,7 +607,9 @@ kern_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, && !(name[0] == KERN_PROC || name[0] == KERN_PROF || name[0] == KERN_KDEBUG +#if !CONFIG_EMBEDDED || name[0] == KERN_PROCARGS +#endif || name[0] == KERN_PROCARGS2 || name[0] == KERN_IPC || name[0] == KERN_SYSV @@ -635,9 +637,11 @@ kern_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, #endif case KERN_KDEBUG: return (kdebug_ops(name + 1, namelen - 1, oldp, oldlenp, p)); +#if !CONFIG_EMBEDDED case KERN_PROCARGS: /* new one as it does not use kinfo_proc */ return (sysctl_procargs(name + 1, namelen - 1, oldp, oldlenp, p)); +#endif case KERN_PROCARGS2: /* new one as it does not use kinfo_proc */ return (sysctl_procargs2(name + 1, namelen - 1, oldp, oldlenp, p)); @@ -2224,6 +2228,9 @@ static int sysctl_coredump (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { +#ifdef SECURE_KERNEL + return (ENOTSUP); +#endif int new_value, changed; int error = sysctl_io_number(req, do_coredump, sizeof(int), &new_value, &changed); if (changed) { @@ -2243,6 +2250,9 @@ static int sysctl_suid_coredump (__unused struct sysctl_oid *oidp, __unused void *arg1, 
__unused int arg2, struct sysctl_req *req) { +#ifdef SECURE_KERNEL + return (ENOTSUP); +#endif int new_value, changed; int error = sysctl_io_number(req, sugid_coredump, sizeof(int), &new_value, &changed); if (changed) { diff --git a/bsd/kern/mach_loader.c b/bsd/kern/mach_loader.c index 9a4f954b9..123339d3a 100644 --- a/bsd/kern/mach_loader.c +++ b/bsd/kern/mach_loader.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -162,6 +163,15 @@ int load_code_signature( cpu_type_t cputype, load_result_t *result); +#if CONFIG_CODE_DECRYPTION +static load_return_t +set_code_unprotect( + struct encryption_info_command *lcp, + caddr_t addr, + vm_map_t map, + struct vnode *vp); +#endif + static load_return_t load_unixthread( struct thread_command *tcp, @@ -436,7 +446,6 @@ parse_machfile( kfree(kl_addr, kl_size); return(LOAD_IOERROR); } - /* (void)ubc_map(vp, PROT_EXEC); */ /* NOT HERE */ /* * Scan through the commands, processing each one as necessary. @@ -551,6 +560,21 @@ parse_machfile( got_code_signatures = TRUE; } break; +#if CONFIG_CODE_DECRYPTION + case LC_ENCRYPTION_INFO: + if (pass != 2) + break; + ret = set_code_unprotect( + (struct encryption_info_command *) lcp, + addr, map, vp); + if (ret != LOAD_SUCCESS) { + printf("proc %d: set unprotect error %d " + "for file \"%s\"\n", + p->p_pid, ret, vp->v_name); + ret = LOAD_SUCCESS; /* ignore error */ + } + break; +#endif default: /* Other commands are ignored by the kernel */ ret = LOAD_SUCCESS; @@ -597,13 +621,10 @@ parse_machfile( if (kl_addr ) kfree(kl_addr, kl_size); - if (ret == LOAD_SUCCESS) - (void)ubc_map(vp, PROT_READ | PROT_EXEC); - return(ret); } -#ifdef __i386__ +#if CONFIG_CODE_DECRYPTION #define APPLE_UNPROTECTED_HEADER_SIZE (3 * PAGE_SIZE_64) @@ -640,9 +661,14 @@ unprotect_segment_64( map_size -= delta; } /* ... transform the rest of the mapping. 
*/ + struct pager_crypt_info crypt_info; + crypt_info.page_decrypt = dsmos_page_transform; + crypt_info.crypt_ops = NULL; + crypt_info.crypt_end = NULL; kr = vm_map_apple_protected(map, map_addr, - map_addr + map_size); + map_addr + map_size, + &crypt_info); } if (kr != KERN_SUCCESS) { @@ -650,10 +676,10 @@ unprotect_segment_64( } return LOAD_SUCCESS; } -#else /* __i386__ */ +#else /* CONFIG_CODE_DECRYPTION */ #define unprotect_segment_64(file_off, file_size, map, map_addr, map_size) \ LOAD_SUCCESS -#endif /* __i386__ */ +#endif /* CONFIG_CODE_DECRYPTION */ static load_return_t @@ -1293,7 +1319,6 @@ load_dylinker( if (ret == LOAD_SUCCESS) { result->dynlinker = TRUE; result->entry_point = myresult.entry_point; - (void)ubc_map(vp, PROT_READ | PROT_EXEC); } out: vnode_put(vp); @@ -1316,6 +1341,7 @@ load_code_signature( int resid; struct cs_blob *blob; int error; + vm_size_t blob_size; addr = 0; blob = NULL; @@ -1341,7 +1367,8 @@ load_code_signature( goto out; } - kr = kmem_alloc(kernel_map, &addr, round_page(lcp->datasize)); + blob_size = lcp->datasize; + kr = ubc_cs_blob_allocate(&addr, &blob_size); if (kr != KERN_SUCCESS) { ret = LOAD_NOSPACE; goto out; @@ -1383,13 +1410,117 @@ load_code_signature( result->csflags |= blob->csb_flags; } if (addr != 0) { - kmem_free(kernel_map, addr, round_page(lcp->datasize)); + ubc_cs_blob_deallocate(addr, blob_size); addr = 0; } return ret; } + +#if CONFIG_CODE_DECRYPTION + +static load_return_t +set_code_unprotect( + struct encryption_info_command *eip, + caddr_t addr, + vm_map_t map, + struct vnode *vp) +{ + int result, len; + char vpath[MAXPATHLEN]; + pager_crypt_info_t crypt_info; + const char * cryptname = 0; + + size_t offset; + struct segment_command_64 *seg64; + struct segment_command *seg32; + vm_map_offset_t map_offset, map_size; + kern_return_t kr; + + switch(eip->cryptid) { + case 0: + /* not encrypted, just an empty load command */ + return LOAD_SUCCESS; + case 1: + cryptname="com.apple.unfree"; + break; + case 0x10: + 
/* some random cryptid that you could manually put into + * your binary if you want NULL */ + cryptname="com.apple.null"; + break; + default: + return LOAD_FAILURE; + } + + len = MAXPATHLEN; + result = vn_getpath(vp, vpath, &len); + if(result) return result; + + /* set up decrypter first */ + if(NULL==text_crypter_create) return LOAD_FAILURE; + kr=text_crypter_create(&crypt_info, cryptname, (void*)vpath); + + if(kr) { + printf("set_code_unprotect: unable to find decrypter %s, kr=%d\n", + cryptname, kr); + return LOAD_FAILURE; + } + + /* this is terrible, but we have to rescan the load commands to find the + * virtual address of this encrypted stuff. This code is gonna look like + * the dyld source one day... */ + struct mach_header *header = (struct mach_header *)addr; + size_t mach_header_sz = sizeof(struct mach_header); + if (header->magic == MH_MAGIC_64 || + header->magic == MH_CIGAM_64) { + mach_header_sz = sizeof(struct mach_header_64); + } + offset = mach_header_sz; + uint32_t ncmds = header->ncmds; + while (ncmds--) { + /* + * Get a pointer to the command. 
+ */ + struct load_command *lcp = (struct load_command *)(addr + offset); + offset += lcp->cmdsize; + + switch(lcp->cmd) { + case LC_SEGMENT_64: + seg64 = (struct segment_command_64 *)lcp; + if ((seg64->fileoff <= eip->cryptoff) && + (seg64->fileoff+seg64->filesize >= + eip->cryptoff+eip->cryptsize)) { + map_offset = seg64->vmaddr + eip->cryptoff - seg64->fileoff; + map_size = eip->cryptsize; + goto remap_now; + } + case LC_SEGMENT: + seg32 = (struct segment_command *)lcp; + if ((seg32->fileoff <= eip->cryptoff) && + (seg32->fileoff+seg32->filesize >= + eip->cryptoff+eip->cryptsize)) { + map_offset = seg32->vmaddr + eip->cryptoff - seg32->fileoff; + map_size = eip->cryptsize; + goto remap_now; + } + } + } + + /* if we get here, did not find anything */ + return LOAD_FAILURE; + +remap_now: + /* now remap using the decrypter */ + kr = vm_map_apple_protected(map, map_offset, map_offset+map_size, &crypt_info); + if(kr) printf("set_code_unprotect(): mapping failed with %x\n", kr); + + return LOAD_SUCCESS; +} + +#endif + /* * This routine exists to support the load_dylinker(). 
* diff --git a/bsd/kern/mcache.c b/bsd/kern/mcache.c index 9d5fb63d6..c8912286e 100644 --- a/bsd/kern/mcache.c +++ b/bsd/kern/mcache.c @@ -203,7 +203,7 @@ mcache_init(void) (btp->bt_bktsize + 1) * sizeof (void *), 0, 0, MCR_SLEEP); } - PE_parse_boot_arg("mcache_flags", &mcache_flags); + PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof (mcache_flags)); mcache_flags &= MCF_FLAGS_MASK; mcache_audit_cache = mcache_create("audit", sizeof (mcache_audit_t), diff --git a/bsd/kern/pthread_synch.c b/bsd/kern/pthread_synch.c index 9ccbc9a9e..4ccfd04fc 100644 --- a/bsd/kern/pthread_synch.c +++ b/bsd/kern/pthread_synch.c @@ -1001,20 +1001,21 @@ bsdthread_terminate(__unused struct proc *p, struct bsdthread_terminate_args *u #if 0 KERNEL_DEBUG_CONSTANT(0x9000084 |DBG_FUNC_START, (unsigned int)freeaddr, (unsigned int)freesize, (unsigned int)kthport, 0xff, 0); #endif - if (sem != MACH_PORT_NULL) { - kret = semaphore_signal_internal_trap(sem); + if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) { + kret = mach_vm_deallocate(current_map(), freeaddr, freesize); if (kret != KERN_SUCCESS) { return(EINVAL); } } - if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) { - kret = mach_vm_deallocate(current_map(), freeaddr, freesize); + + (void) thread_terminate(current_thread()); + if (sem != MACH_PORT_NULL) { + kret = semaphore_signal_internal_trap(sem); if (kret != KERN_SUCCESS) { return(EINVAL); } } - (void) thread_terminate(current_thread()); if (kthport != MACH_PORT_NULL) mach_port_deallocate(get_task_ipcspace(current_task()), kthport); thread_exception_return(); @@ -1982,7 +1983,6 @@ wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl, int setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl) { - #if defined(__ppc__) /* * Set up PowerPC registers... 
diff --git a/bsd/kern/subr_log.c b/bsd/kern/subr_log.c index af0e42341..def0a5aac 100644 --- a/bsd/kern/subr_log.c +++ b/bsd/kern/subr_log.c @@ -484,6 +484,13 @@ log_dmesg(user_addr_t buffer, uint32_t buffersize, register_t * retval) { continue; newl = ch == '\n'; localbuff[i++] = ch; + /* The original version of this routine contained a buffer + * overflow. At the time, a "small" targeted fix was desired + * so the change below to check the buffer bounds was made. + * TODO: rewrite this needlessly convoluted routine. + */ + if (i == (localbuff_size - 2)) + break; } if (!newl) localbuff[i++] = '\n'; diff --git a/bsd/kern/syscalls.master b/bsd/kern/syscalls.master index 10d694d4c..cb5a66b78 100644 --- a/bsd/kern/syscalls.master +++ b/bsd/kern/syscalls.master @@ -528,15 +528,29 @@ 359 ALL { int nosys(void); } #endif +#if CONFIG_WORKQUEUE 360 ALL { user_addr_t bsdthread_create(user_addr_t func, user_addr_t func_arg, user_addr_t stack, user_addr_t pthread, uint32_t flags) NO_SYSCALL_STUB; } 361 ALL { int bsdthread_terminate(user_addr_t stackaddr, size_t freesize, uint32_t port, uint32_t sem) NO_SYSCALL_STUB; } +#else +360 ALL { int nosys(void); } +361 ALL { int nosys(void); } +#endif + 362 ALL { int kqueue(void); } 363 ALL { int kevent(int fd, const struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); } 364 ALL { int lchown(user_addr_t path, uid_t owner, gid_t group); } 365 ALL { int stack_snapshot(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t options) NO_SYSCALL_STUB; } + +#if CONFIG_WORKQUEUE 366 ALL { int bsdthread_register(user_addr_t threadstart, user_addr_t wqthread, int pthsize) NO_SYSCALL_STUB; } 367 ALL { int workq_open(void) NO_SYSCALL_STUB; } 368 ALL { int workq_ops(int options, user_addr_t item, int prio) NO_SYSCALL_STUB; } +#else +366 ALL { int nosys(void); } +367 ALL { int nosys(void); } +368 ALL { int nosys(void); } +#endif + 369 ALL { int nosys(void); } 370 ALL { int 
nosys(void); } 371 ALL { int nosys(void); } diff --git a/bsd/kern/ubc_subr.c b/bsd/kern/ubc_subr.c index 43cd45431..df8047ea8 100644 --- a/bsd/kern/ubc_subr.c +++ b/bsd/kern/ubc_subr.c @@ -66,6 +66,8 @@ #include +#include + /* XXX These should be in a BSD accessible Mach header, but aren't. */ extern kern_return_t memory_object_pages_resident(memory_object_control_t, boolean_t *); @@ -217,7 +219,6 @@ CS_CodeDirectory *findCodeDirectory( */ cd = (const CS_CodeDirectory *) embedded; } - if (cd && cs_valid_range(cd, cd + 1, lower_bound, upper_bound) && cs_valid_range(cd, (const char *) cd + ntohl(cd->length), @@ -1936,6 +1937,10 @@ ubc_upl_commit_range( if (flags & UPL_COMMIT_FREE_ON_EMPTY) flags |= UPL_COMMIT_NOTIFY_EMPTY; + if (flags & UPL_COMMIT_KERNEL_ONLY_FLAGS) { + return KERN_INVALID_ARGUMENT; + } + pl = UPL_GET_INTERNAL_PAGE_LIST(upl); kr = upl_commit_range(upl, offset, size, flags, @@ -2106,7 +2111,7 @@ UBCINFOEXISTS(struct vnode * vp) /* * CODE SIGNING */ -#define CS_BLOB_KEEP_IN_KERNEL 1 +#define CS_BLOB_PAGEABLE 0 static volatile SInt32 cs_blob_size = 0; static volatile SInt32 cs_blob_count = 0; static SInt32 cs_blob_size_peak = 0; @@ -2123,6 +2128,39 @@ SYSCTL_INT(_vm, OID_AUTO, cs_blob_count_peak, CTLFLAG_RD, &cs_blob_count_peak, 0 SYSCTL_INT(_vm, OID_AUTO, cs_blob_size_peak, CTLFLAG_RD, &cs_blob_size_peak, 0, "Peak size of code signature blobs"); SYSCTL_INT(_vm, OID_AUTO, cs_blob_size_max, CTLFLAG_RD, &cs_blob_size_max, 0, "Size of biggest code signature blob"); +kern_return_t +ubc_cs_blob_allocate( + vm_offset_t *blob_addr_p, + vm_size_t *blob_size_p) +{ + kern_return_t kr; + +#if CS_BLOB_PAGEABLE + *blob_size_p = round_page(*blob_size_p); + kr = kmem_alloc(kernel_map, blob_addr_p, *blob_size_p); +#else /* CS_BLOB_PAGEABLE */ + *blob_addr_p = (vm_offset_t) kalloc(*blob_size_p); + if (*blob_addr_p == 0) { + kr = KERN_NO_SPACE; + } else { + kr = KERN_SUCCESS; + } +#endif /* CS_BLOB_PAGEABLE */ + return kr; +} + +void +ubc_cs_blob_deallocate( + 
vm_offset_t blob_addr, + vm_size_t blob_size) +{ +#if CS_BLOB_PAGEABLE + kmem_free(kernel_map, blob_addr, blob_size); +#else /* CS_BLOB_PAGEABLE */ + kfree((void *) blob_addr, blob_size); +#endif /* CS_BLOB_PAGEABLE */ +} + int ubc_cs_blob_add( struct vnode *vp, @@ -2148,6 +2186,7 @@ ubc_cs_blob_add( return ENOMEM; } +#if CS_BLOB_PAGEABLE /* get a memory entry on the blob */ blob_size = (memory_object_size_t) size; kr = mach_make_memory_entry_64(kernel_map, @@ -2168,7 +2207,10 @@ ubc_cs_blob_add( error = EINVAL; goto out; } - +#else + blob_size = (memory_object_size_t) size; + blob_handle = IPC_PORT_NULL; +#endif /* fill in the new blob */ blob->csb_cpu_type = cputype; @@ -2177,7 +2219,6 @@ ubc_cs_blob_add( blob->csb_mem_offset = 0; blob->csb_mem_handle = blob_handle; blob->csb_mem_kaddr = addr; - /* * Validate the blob's contents @@ -2207,7 +2248,15 @@ ubc_cs_blob_add( SHA1Final(blob->csb_sha1, &sha1ctxt); } - + /* + * Let policy module check whether the blob's signature is accepted. + */ +#if CONFIG_MACF + error = mac_vnode_check_signature(vp, blob->csb_sha1, (void*)addr, size); + if (error) + goto out; +#endif + /* * Validate the blob's coverage */ @@ -2328,10 +2377,6 @@ ubc_cs_blob_add( blob->csb_flags); } -#if !CS_BLOB_KEEP_IN_KERNEL - blob->csb_mem_kaddr = 0; -#endif /* CS_BLOB_KEEP_IN_KERNEL */ - vnode_unlock(vp); error = 0; /* success ! */ @@ -2347,10 +2392,6 @@ ubc_cs_blob_add( mach_memory_entry_port_release(blob_handle); blob_handle = IPC_PORT_NULL; } - } else { -#if !CS_BLOB_KEEP_IN_KERNEL - kmem_free(kernel_map, addr, size); -#endif /* CS_BLOB_KEEP_IN_KERNEL */ } if (error == EAGAIN) { @@ -2363,7 +2404,7 @@ ubc_cs_blob_add( /* * Since we're not failing, consume the data we received. 
*/ - kmem_free(kernel_map, addr, size); + ubc_cs_blob_deallocate(addr, size); } return error; @@ -2421,12 +2462,13 @@ ubc_cs_free( blob = next_blob) { next_blob = blob->csb_next; if (blob->csb_mem_kaddr != 0) { - kmem_free(kernel_map, - blob->csb_mem_kaddr, - blob->csb_mem_size); + ubc_cs_blob_deallocate(blob->csb_mem_kaddr, + blob->csb_mem_size); blob->csb_mem_kaddr = 0; } - mach_memory_entry_port_release(blob->csb_mem_handle); + if (blob->csb_mem_handle != IPC_PORT_NULL) { + mach_memory_entry_port_release(blob->csb_mem_handle); + } blob->csb_mem_handle = IPC_PORT_NULL; OSAddAtomic(-1, &cs_blob_count); OSAddAtomic(-blob->csb_mem_size, &cs_blob_size); @@ -2537,9 +2579,6 @@ cs_validate_page( cd->hashType != 0x1 || cd->hashSize != SHA1_RESULTLEN) { /* bogus blob ? */ -#if !CS_BLOB_KEEP_IN_KERNEL - kmem_free(kernel_map, kaddr, ksize); -#endif /* CS_BLOB_KEEP_IN_KERNEL */ continue; } @@ -2549,9 +2588,6 @@ cs_validate_page( if (offset < start_offset || offset >= end_offset) { /* our page is not covered by this blob */ -#if !CS_BLOB_KEEP_IN_KERNEL - kmem_free(kernel_map, kaddr, ksize); -#endif /* CS_BLOB_KEEP_IN_KERNEL */ continue; } @@ -2564,11 +2600,6 @@ cs_validate_page( found_hash = TRUE; } -#if !CS_BLOB_KEEP_IN_KERNEL - /* we no longer need that blob in the kernel map */ - kmem_free(kernel_map, kaddr, ksize); -#endif /* CS_BLOB_KEEP_IN_KERNEL */ - break; } } @@ -2591,9 +2622,9 @@ cs_validate_page( validated = FALSE; *tainted = FALSE; } else { - const uint32_t *asha1, *esha1; size = PAGE_SIZE; + const uint32_t *asha1, *esha1; if (offset + size > codeLimit) { /* partial page at end of segment */ assert(offset < codeLimit); @@ -2601,7 +2632,7 @@ cs_validate_page( } /* compute the actual page's SHA1 hash */ SHA1Init(&sha1ctxt); - SHA1Update(&sha1ctxt, data, size); + SHA1UpdateUsePhysicalAddress(&sha1ctxt, data, size); SHA1Final(actual_hash, &sha1ctxt); asha1 = (const uint32_t *) actual_hash; diff --git a/bsd/kern/uipc_mbuf.c b/bsd/kern/uipc_mbuf.c index 
aba70cc66..1784e5f1d 100644 --- a/bsd/kern/uipc_mbuf.c +++ b/bsd/kern/uipc_mbuf.c @@ -1026,7 +1026,7 @@ mbinit(void) VERIFY(slabstbl != NULL); /* Allocate audit structures if needed */ - PE_parse_boot_arg("mbuf_debug", &mbuf_debug); + PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug)); mbuf_debug |= mcache_getflags(); if (mbuf_debug & MCF_AUDIT) { MALLOC(mclaudit, mcl_audit_t *, @@ -1051,7 +1051,7 @@ mbinit(void) embutl = (union mcluster *) ((unsigned char *)mbutl + (nmbclusters * MCLBYTES)); - PE_parse_boot_arg("initmcl", &initmcl); + PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl)); lck_mtx_lock(mbuf_mlock); diff --git a/bsd/kern/uipc_socket.c b/bsd/kern/uipc_socket.c index 7b259ec9f..57dff6de9 100644 --- a/bsd/kern/uipc_socket.c +++ b/bsd/kern/uipc_socket.c @@ -245,7 +245,7 @@ socketinit(void) return; } - PE_parse_boot_arg("socket_debug", &socket_debug); + PE_parse_boot_argn("socket_debug", &socket_debug, sizeof (socket_debug)); /* * allocate lock group attribute and group for socket cache mutex diff --git a/bsd/man/man2/Makefile b/bsd/man/man2/Makefile index 5a44dee4f..b8984ac80 100644 --- a/bsd/man/man2/Makefile +++ b/bsd/man/man2/Makefile @@ -54,9 +54,7 @@ DATAFILES = \ fremovexattr.2 \ fsetxattr.2 \ fstat.2 \ - fstat64.2 \ fstatfs.2 \ - fstatfs64.2 \ fsync.2 \ ftruncate.2 \ futimes.2 \ @@ -102,7 +100,6 @@ DATAFILES = \ listxattr.2 \ lseek.2 \ lstat.2 \ - lstat64.2 \ madvise.2 \ mincore.2 \ minherit.2 \ @@ -181,9 +178,7 @@ DATAFILES = \ socket.2 \ socketpair.2 \ stat.2 \ - stat64.2 \ statfs.2 \ - statfs64.2 \ symlink.2 \ sync.2 \ syscall.2 \ diff --git a/bsd/man/man2/fstat64.2 b/bsd/man/man2/fstat64.2 deleted file mode 100644 index b1a86c195..000000000 --- a/bsd/man/man2/fstat64.2 +++ /dev/null @@ -1 +0,0 @@ -.so man2/stat.2 diff --git a/bsd/man/man2/fstatfs64.2 b/bsd/man/man2/fstatfs64.2 deleted file mode 100644 index 923d3c0cc..000000000 --- a/bsd/man/man2/fstatfs64.2 +++ /dev/null @@ -1 +0,0 @@ -.so man2/statfs.2 diff --git 
a/bsd/man/man2/lstat64.2 b/bsd/man/man2/lstat64.2 deleted file mode 100644 index 4fe4fb441..000000000 --- a/bsd/man/man2/lstat64.2 +++ /dev/null @@ -1,2 +0,0 @@ -.so man2/stat.2 - diff --git a/bsd/man/man2/stat.2 b/bsd/man/man2/stat.2 index 0c9f062e0..53704ece8 100644 --- a/bsd/man/man2/stat.2 +++ b/bsd/man/man2/stat.2 @@ -38,11 +38,8 @@ .Os BSD 4 .Sh NAME .Nm fstat , -.Nm fstat64 , .Nm lstat , -.Nm lstat64 , -.Nm stat , -.Nm stat64 +.Nm stat .Nd get file status .Sh SYNOPSIS .Fd #include @@ -52,34 +49,19 @@ .Fa "struct stat *buf" .Fc .Ft int -.Fo fstat64 -.Fa "int fildes" -.Fa "struct stat64 *buf" -.Fc -.Ft int .Fo lstat .Fa "const char *restrict path" .Fa "struct stat *restrict buf" .Fc .Ft int -.Fo lstat64 -.Fa "const char *restrict path" -.Fa "struct stat64 *restrict buf" -.Fc -.Ft int .Fo stat .Fa "const char *restrict path" .Fa "struct stat *restrict buf" .Fc -.Ft int -.Fo stat64 -.Fa "const char *restrict path" -.Fa "struct stat64 *restrict buf" -.Fc .Sh DESCRIPTION The .Fn stat -family of functions and their 64 bit variants obtain information about a file. The +family of functions obtain information about a file. The .Fn stat function obtains information about the file pointed to by .Fa path . @@ -116,38 +98,16 @@ The .Fa buf argument is a pointer to a .Fa stat -or -.Fa stat64 structure as defined by .Aq Pa sys/stat.h -(both shown below) and into which information is placed concerning the file. 
.Bd -literal struct stat { - dev_t st_dev; /* device inode resides on */ - ino_t st_ino; /* inode's number */ - mode_t st_mode; /* inode protection mode */ - nlink_t st_nlink; /* number or hard links to the file */ - uid_t st_uid; /* user-id of owner */ - gid_t st_gid; /* group-id of owner */ - dev_t st_rdev; /* device type, for special file inode */ - struct timespec st_atimespec; /* time of last access */ - struct timespec st_mtimespec; /* time of last data modification */ - struct timespec st_ctimespec; /* time of last file status change */ - off_t st_size; /* file size, in bytes */ - quad_t st_blocks; /* blocks allocated for file */ - u_long st_blksize;/* optimal file sys I/O ops blocksize */ - u_long st_flags; /* user defined flags for file */ - u_long st_gen; /* file generation number */ -}; - - -struct stat64 { dev_t st_dev; /* ID of device containing file */ mode_t st_mode; /* Mode of file (see below) */ nlink_t st_nlink; /* Number of hard links */ - ino64_t st_ino; /* File serial number */ + ino_t st_ino; /* File serial number */ uid_t st_uid; /* User ID of the file */ gid_t st_gid; /* Group ID of the file */ dev_t st_rdev; /* Device ID */ @@ -169,8 +129,6 @@ struct stat64 { .Pp The time-related fields of .Fa struct stat -and -.Fa struct stat64 are as follows: .Bl -tag -width XXXst_birthtime .It st_atime @@ -203,9 +161,8 @@ and .Xr write 2 system calls. .It st_birthtime -Time of file creation. Only set once when the file is created. This field is -only available in the 64 bit variants. On filesystems where birthtime is -not available, this field holds the +Time of file creation. Only set once when the file is created. +On filesystems where birthtime is not available, this field holds the .Fa ctime instead. .El @@ -343,23 +300,6 @@ in the structure pointed to by The file generation number, .Fa st_gen , is only available to the super-user. 
-.br -The fields in the stat structure currently marked -.Fa st_spare1 , -.Fa st_spare2 , -and -.Fa st_spare3 -are present in preparation for inode time stamps expanding -to 64 bits. This, however, can break certain programs that -depend on the time stamps being contiguous (in calls to -.Xr utimes 2 ) . -.Sh LEGACY SYNOPSIS -.Fd #include -.Fd #include -.Pp -The include file -.In sys/types.h -is necessary. .Sh SEE ALSO .Xr chflags 2 , .Xr chmod 2 , @@ -386,9 +326,3 @@ An .Fn lstat function call appeared in .Bx 4.2 . -The -.Fn stat64 , -.Fn fstat64 , -and -.Fn lstat64 -system calls first appeared in Mac OS X 10.5 (Leopard). diff --git a/bsd/man/man2/stat64.2 b/bsd/man/man2/stat64.2 deleted file mode 100644 index b1a86c195..000000000 --- a/bsd/man/man2/stat64.2 +++ /dev/null @@ -1 +0,0 @@ -.so man2/stat.2 diff --git a/bsd/man/man2/statfs.2 b/bsd/man/man2/statfs.2 index 1ababf3f8..4b6a3db35 100644 --- a/bsd/man/man2/statfs.2 +++ b/bsd/man/man2/statfs.2 @@ -38,9 +38,7 @@ .Os .Sh NAME .Nm statfs, -.Nm statfs64, -.Nm fstatfs, -.Nm fstatfs64 +.Nm fstatfs .Nd get file system statistics .Sh SYNOPSIS .Fd #include @@ -48,11 +46,7 @@ .Ft int .Fn statfs "const char *path" "struct statfs *buf" .Ft int -.Fn statfs64 "const char *path" "struct statfs64 *buf" -.Ft int .Fn fstatfs "int fd" "struct statfs *buf" -.Ft int -.Fn fstatfs64 "int fd" "struct statfs64 *buf" .Sh DESCRIPTION .Fn Statfs returns information about a mounted file system. @@ -61,41 +55,14 @@ is the path name of any file within the mounted file system. .Fa Buf is a pointer to a .Fa statfs -or -.Fa statfs64 structure defined as follows: .Bd -literal typedef struct { int32_t val[2]; } fsid_t; -#define MFSNAMELEN 15 /* length of fs type name, not inc. 
nul */ -#define MNAMELEN 90 /* length of buffer for returned name */ #define MFSTYPENAMELEN 16 /* length of fs type name including null */ #define MAXPATHLEN 1024 struct statfs { - short f_otype; /* type of file system (reserved: zero) */ - short f_oflags; /* copy of mount flags (reserved: zero) */ - long f_bsize; /* fundamental file system block size */ - long f_iosize; /* optimal transfer block size */ - long f_blocks; /* total data blocks in file system */ - long f_bfree; /* free blocks in fs */ - long f_bavail; /* free blocks avail to non-superuser */ - long f_files; /* total file nodes in file system */ - long f_ffree; /* free file nodes in fs */ - fsid_t f_fsid; /* file system id */ - uid_t f_owner; /* user that mounted the file system */ - short f_reserved1; /* reserved for future use */ - short f_type; /* type of file system (reserved) */ - long f_flags; /* copy of mount flags (reserved) */ - long f_reserved2[2]; /* reserved for future use */ - char f_fstypename[MFSNAMELEN]; /* fs type name */ - char f_mntonname[MNAMELEN]; /* directory on which mounted */ - char f_mntfromname[MNAMELEN]; /* mounted file system */ - char f_reserved3; /* reserved for future use */ - long f_reserved4[4]; /* reserved for future use */ -}; - -struct statfs64 { uint32_t f_bsize; /* fundamental file system block size */ int32_t f_iosize; /* optimal transfer block size */ uint64_t f_blocks; /* total data blocks in file system */ @@ -223,8 +190,4 @@ error occurred while reading from or writing to the file system. .Sh HISTORY The .Fn statfs -function first appeared in 4.4BSD. The -.Fn statfs64 -and -.Fn fstatfs64 -first appeared in Max OS X 10.5 (Leopard). +function first appeared in 4.4BSD. 
diff --git a/bsd/man/man2/statfs64.2 b/bsd/man/man2/statfs64.2 deleted file mode 100644 index 3a64852e4..000000000 --- a/bsd/man/man2/statfs64.2 +++ /dev/null @@ -1,3 +0,0 @@ -.so man2/statfs.2 - - diff --git a/bsd/man/man5/dir.5 b/bsd/man/man5/dir.5 index b33c223c0..891e5db37 100644 --- a/bsd/man/man5/dir.5 +++ b/bsd/man/man5/dir.5 @@ -88,32 +88,25 @@ and further in the file .Bd -literal /*** Excerpt from ***/ /* - * The dirent structure defines the format of directory entries returned by - * the getdirentries(2) system call. + * The dirent structure defines the format of directory entries. * * A directory entry has a struct dirent at the front of it, containing its * inode number, the length of the entry, and the length of the name * contained in the entry. These are followed by the name padded to a 4 * byte boundary with null bytes. All names are guaranteed null terminated. - * The maximum length of a name in a directory is MAXNAMLEN. - * The dirent structure defines the format of directory entries returned by - * the getdirentries(2) system call. + * The maximum length of a name in a directory is MAXPATHLEN. 
*/ #ifndef _SYS_DIRENT_H #define _SYS_DIRENT_H struct dirent { - u_int32_t d_fileno; /* file number of entry */ + ino_t d_ino; /* file number of entry */ + u_int64_t d_seekoff; /* seek offset (optional, used by servers) */ u_int16_t d_reclen; /* length of this record */ + u_int16_t d_namlen; /* length of string in d_name */ u_int8_t d_type; /* file type, see below */ - u_int8_t d_namlen; /* length of string in d_name */ -#ifdef _POSIX_SOURCE - char d_name[255 + 1]; /* name must be no longer than this */ -#else -#define MAXNAMLEN 255 - char d_name[MAXNAMLEN + 1]; /* name must be no longer than this */ -#endif + char d_name[MAXPATHLEN]; /* name must be no longer than this */ }; /* @@ -139,51 +132,26 @@ struct dirent { #ifndef _DIRENT_H #define _DIRENT_H -#ifdef _POSIX_SOURCE -typedef void * DIR; -#else - -#define d_ino d_fileno /* backward compatibility */ - /* definitions for library routines operating on directories. */ #define DIRBLKSIZ 1024 struct _telldir; /* see telldir.h */ /* structure describing an open directory. 
*/ -typedef struct _dirdesc { - int dd_fd; /* file descriptor associated with directory */ - long dd_loc; /* offset in current buffer */ - long dd_size; /* amount of data returned by getdirentries */ - char *dd_buf; /* data buffer */ - int dd_len; /* size of data buffer */ - long dd_seek; /* magic cookie returned by getdirentries */ - long dd_rewind; /* magic cookie for rewinding */ - int dd_flags; /* flags for readdir */ - pthread_mutex_t dd_lock; /* for thread locking */ - struct _telldir *dd_td; /* telldir position recording */ +typedef struct { + int __dd_fd; /* file descriptor associated with directory */ + long __dd_loc; /* offset in current buffer */ + long __dd_size; /* amount of data returned by getdirentries */ + char *__dd_buf; /* data buffer */ + int __dd_len; /* size of data buffer */ + long __dd_seek; /* magic cookie returned by getdirentries */ + long __dd_rewind; /* magic cookie for rewinding */ + int __dd_flags; /* flags for readdir */ + pthread_mutex_t __dd_lock; /* for thread locking */ + struct _telldir *__dd_td; /* telldir position recording */ } DIR; -#define dirfd(dirp) ((dirp)->dd_fd) - -/* flags for opendir2 */ -#define DTF_HIDEW 0x0001 /* hide whiteout entries */ -#define DTF_NODUP 0x0002 /* don't return duplicate names */ -/* structure describing an open directory. 
*/ -typedef struct _dirdesc { - int dd_fd; /* file descriptor associated with directory */ - long dd_loc; /* offset in current buffer */ - long dd_size; /* amount of data returned by getdirentries */ - char *dd_buf; /* data buffer */ - int dd_len; /* size of data buffer */ - long dd_seek; /* magic cookie returned by getdirentries */ - long dd_rewind; /* magic cookie for rewinding */ - int dd_flags; /* flags for readdir */ - pthread_mutex_t dd_lock; /* for thread locking */ - struct _telldir *dd_td; /* telldir position recording */ -} DIR; - -#define dirfd(dirp) ((dirp)->dd_fd) +#define dirfd(dirp) ((dirp)->__dd_fd) /* flags for opendir2 */ #define DTF_HIDEW 0x0001 /* hide whiteout entries */ @@ -191,12 +159,6 @@ typedef struct _dirdesc { #define DTF_REWIND 0x0004 /* rewind after reading union stack */ #define __DTF_READALL 0x0008 /* everything has been read */ -#ifndef NULL -#define NULL 0 -#endif - -#endif /* _POSIX_SOURCE */ - #endif /* !_DIRENT_H_ */ .Ed .Sh SEE ALSO diff --git a/bsd/man/man5/types.5 b/bsd/man/man5/types.5 index 8226db199..9f030a865 100644 --- a/bsd/man/man5/types.5 +++ b/bsd/man/man5/types.5 @@ -93,7 +93,7 @@ typedef long * qaddr_t; /* should be typedef quad * qaddr_t; */ typedef long daddr_t; typedef char * caddr_t; -typedef u_long ino_t; +typedef u_int64_t ino_t; typedef long swblk_t; typedef long segsz_t; typedef long off_t; diff --git a/bsd/miscfs/devfs/devfs_vnops.c b/bsd/miscfs/devfs/devfs_vnops.c index d9aedf798..6c7f5af9d 100644 --- a/bsd/miscfs/devfs/devfs_vnops.c +++ b/bsd/miscfs/devfs/devfs_vnops.c @@ -565,14 +565,19 @@ devfsspec_close(struct vnop_close_args *ap) struct vnode * vp = ap->a_vp; register devnode_t * dnp; struct timeval now; + int ref = 1; - if (vnode_isinuse(vp, 1)) { + if (vp->v_type == VBLK) + ref = 0; + + if (vnode_isinuse(vp, ref)) { DEVFS_LOCK(); microtime(&now); dnp = VTODN(vp); dn_times(dnp, &now, &now, &now); DEVFS_UNLOCK(); } + return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_close), ap)); } diff --git 
a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c index 752afe8df..aad1b0250 100644 --- a/bsd/miscfs/specfs/spec_vnops.c +++ b/bsd/miscfs/specfs/spec_vnops.c @@ -590,7 +590,6 @@ spec_fsync(struct vnop_fsync_args *ap) */ extern int hard_throttle_on_root; void IOSleep(int); -extern void throttle_lowpri_io(int *lowpri_window,mount_t v_mount); // the low priority process may wait for at most LOWPRI_MAX_DELAY millisecond #define LOWPRI_INITIAL_WINDOW_MSECS 100 @@ -599,6 +598,12 @@ extern void throttle_lowpri_io(int *lowpri_window,mount_t v_mount); #define LOWPRI_MAX_WAITING_MSECS 200 #define LOWPRI_SLEEP_INTERVAL 5 +struct _throttle_io_info_t { + struct timeval last_normal_IO_timestamp; + SInt32 numthreads_throttling; +}; + +struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV]; int lowpri_IO_initial_window_msecs = LOWPRI_INITIAL_WINDOW_MSECS; int lowpri_IO_window_msecs_inc = LOWPRI_WINDOW_MSECS_INC; int lowpri_max_window_msecs = LOWPRI_MAX_WINDOW_MSECS; @@ -609,40 +614,74 @@ SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW, &lowpri_IO_window SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); -void throttle_lowpri_io(int *lowpri_window,mount_t v_mount) +int throttle_io_will_be_throttled(int lowpri_window_msecs, size_t devbsdunit) { - int i; - struct timeval last_lowpri_IO_timestamp,last_normal_IO_timestamp; struct timeval elapsed; - int lowpri_IO_window_msecs; - struct timeval lowpri_IO_window; - int max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL; + int elapsed_msecs; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, - *lowpri_window, 0, 0, 0, 0); + microuptime(&elapsed); + timevalsub(&elapsed, &_throttle_io_info[devbsdunit].last_normal_IO_timestamp); + elapsed_msecs = elapsed.tv_sec * 1000 + 
elapsed.tv_usec / 1000; - last_normal_IO_timestamp = v_mount->last_normal_IO_timestamp; - - for (i=0; iuu_lowpri_window == 0) + return; + + max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, _throttle_io_info[ut->uu_devbsdunit].numthreads_throttling); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, + ut->uu_lowpri_window, 0, 0, 0, 0); + + if (ok_to_sleep == TRUE) { + for (i=0; iuu_lowpri_window, ut->uu_devbsdunit)) { + IOSleep(LOWPRI_SLEEP_INTERVAL); + } else { + break; + } } } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, - *lowpri_window, i*5, 0, 0, 0); - *lowpri_window = 0; + ut->uu_lowpri_window, i*5, 0, 0, 0); + SInt32 oldValue; + oldValue = OSDecrementAtomic(&_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling); + ut->uu_lowpri_window = 0; + + if (oldValue <= 0) { + panic("%s: numthreads negative", __func__); + } +} + +int throttle_get_io_policy(struct uthread **ut) +{ + int policy = IOPOL_DEFAULT; + proc_t p = current_proc(); + + *ut = get_bsdthread_info(current_thread()); + + if (p != NULL) + policy = p->p_iopol_disk; + + if (*ut != NULL) { + // the I/O policy of the thread overrides that of the process + // unless the I/O policy of the thread is default + if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT) + policy = (*ut)->uu_iopol_disk; + } + return policy; } int @@ -677,23 +716,14 @@ spec_strategy(struct vnop_strategy_args *ap) hard_throttle_on_root = 1; if (lowpri_IO_initial_window_msecs) { - proc_t p; struct uthread *ut; - int policy = IOPOL_DEFAULT; + int policy; int is_throttleable_io = 0; int is_passive_io = 0; - p = current_proc(); - ut = get_bsdthread_info(current_thread()); - - if (p != NULL) - policy = p->p_iopol_disk; - - if (ut != NULL) { - // the I/O policy of the thread overrides that of the process - // unless the I/O policy of the thread is default - if (ut->uu_iopol_disk != IOPOL_DEFAULT) - policy = ut->uu_iopol_disk; - } + size_t devbsdunit; + SInt32 oldValue; + + policy = 
throttle_get_io_policy(&ut); switch (policy) { case IOPOL_DEFAULT: @@ -713,9 +743,13 @@ spec_strategy(struct vnop_strategy_args *ap) if (!is_throttleable_io && ISSET(bflags, B_PASSIVE)) is_passive_io |= 1; + if (buf_vnode(bp)->v_mount != NULL) + devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit; + else + devbsdunit = LOWPRI_MAX_NUM_DEV - 1; if (!is_throttleable_io) { - if (!is_passive_io && buf_vnode(bp)->v_mount != NULL){ - microuptime(&(buf_vnode(bp)->v_mount->last_normal_IO_timestamp)); + if (!is_passive_io){ + microuptime(&_throttle_io_info[devbsdunit].last_normal_IO_timestamp); } } else { /* @@ -728,14 +762,25 @@ spec_strategy(struct vnop_strategy_args *ap) * do the delay just before we return from the system * call that triggered this I/O or from vnode_pagein */ - if(buf_vnode(bp)->v_mount != NULL) - ut->v_mount = buf_vnode(bp)->v_mount; if (ut->uu_lowpri_window == 0) { + ut->uu_devbsdunit = devbsdunit; + oldValue = OSIncrementAtomic(&_throttle_io_info[devbsdunit].numthreads_throttling); + if (oldValue < 0) { + panic("%s: numthreads negative", __func__); + } ut->uu_lowpri_window = lowpri_IO_initial_window_msecs; + ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue; } else { - ut->uu_lowpri_window += lowpri_IO_window_msecs_inc; - if (ut->uu_lowpri_window > lowpri_max_window_msecs) - ut->uu_lowpri_window = lowpri_max_window_msecs; + if (ut->uu_devbsdunit != devbsdunit) { // the thread sends I/Os to different devices within the same system call + // keep track of the numthreads in the right device + OSDecrementAtomic(&_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling); + OSIncrementAtomic(&_throttle_io_info[devbsdunit].numthreads_throttling); + ut->uu_devbsdunit = devbsdunit; + } + int numthreads = MAX(1, _throttle_io_info[devbsdunit].numthreads_throttling); + ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads; + if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads) + ut->uu_lowpri_window = lowpri_max_window_msecs * 
numthreads; } } } @@ -827,7 +872,7 @@ spec_close(struct vnop_close_args *ap) * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ - if (vcount(vp) > 1) + if (vcount(vp) > 0) return (0); #else /* DEVFS_IMPLEMENTS_LOCKING */ /* @@ -837,7 +882,7 @@ spec_close(struct vnop_close_args *ap) * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ - if (vcount(vp) > 1) + if (vcount(vp) > 0) return (0); /* diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c index e3b16f486..d38346b0c 100644 --- a/bsd/net/dlil.c +++ b/bsd/net/dlil.c @@ -564,7 +564,7 @@ dlil_affinity_set(struct thread *tp, u_int32_t tag) void dlil_init(void) { - PE_parse_boot_arg("net_affinity", &net_affinity); + PE_parse_boot_argn("net_affinity", &net_affinity, sizeof (net_affinity)); TAILQ_INIT(&dlil_ifnet_head); TAILQ_INIT(&ifnet_head); diff --git a/bsd/net/ether_if_module.c b/bsd/net/ether_if_module.c index 8a4da6c6c..c2607645a 100644 --- a/bsd/net/ether_if_module.c +++ b/bsd/net/ether_if_module.c @@ -406,11 +406,35 @@ ether_demux( } } - /* Quick check for VLAN */ - if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0 || - ether_type == htons(ETHERTYPE_VLAN)) { - *protocol_family = PF_VLAN; - return 0; + /* check for VLAN */ + if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) { + if (EVL_VLANOFTAG(m->m_pkthdr.vlan_tag) != 0) { + *protocol_family = PF_VLAN; + return (0); + } + /* the packet is just priority-tagged, clear the bit */ + m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID; + } + else if (ether_type == htons(ETHERTYPE_VLAN)) { + struct ether_vlan_header * evl; + + evl = (struct ether_vlan_header *)frame_header; + if (m->m_len < ETHER_VLAN_ENCAP_LEN + || ntohs(evl->evl_proto) == ETHERTYPE_VLAN + || EVL_VLANOFTAG(ntohs(evl->evl_tag)) != 0) { + *protocol_family = PF_VLAN; + return 0; + } + /* the packet is just priority-tagged */ + + /* make the encapsulated ethertype the actual ethertype */ + ether_type = 
evl->evl_encap_proto = evl->evl_proto; + + /* remove the encapsulation header */ + m->m_len -= ETHER_VLAN_ENCAP_LEN; + m->m_data += ETHER_VLAN_ENCAP_LEN; + m->m_pkthdr.len -= ETHER_VLAN_ENCAP_LEN; + m->m_pkthdr.csum_flags = 0; /* can't trust hardware checksum */ } data = mtod(m, u_int8_t*); diff --git a/bsd/net/if.h b/bsd/net/if.h index a883f4715..7e82691c2 100644 --- a/bsd/net/if.h +++ b/bsd/net/if.h @@ -148,6 +148,8 @@ struct if_clonereq64 { #define IFEF_VLAN 0x200 /* interface has one or more vlans */ #define IFEF_BOND 0x400 /* interface is part of bond */ #define IFEF_ARPLL 0x800 /* ARP for IPv4LL addresses on this port */ +#define IFEF_NOWINDOWSCALE 0x1000 /* TCP window scale disabled on this interface, see 5933937 & 5959897*/ +#define IFEF_NOTIMESTAMPS IFEF_NOWINDOWSCALE /* We don't actualy disable timestamps, just window scale see 5959897 */ #define IFEF_SENDLIST 0x10000000 /* Interface supports sending a list of packets */ #define IFEF_REUSE 0x20000000 /* DLIL ifnet recycler, ifnet is not new */ #define IFEF_INUSE 0x40000000 /* DLIL ifnet recycler, ifnet in use */ diff --git a/bsd/net/if_vlan.c b/bsd/net/if_vlan.c index 3f5c2c14c..4a783b6d8 100644 --- a/bsd/net/if_vlan.c +++ b/bsd/net/if_vlan.c @@ -1108,6 +1108,7 @@ vlan_input(ifnet_t p, __unused protocol_family_t protocol, /* We found a vlan interface, inject on that interface. 
*/ dlil_input_packet_list(ifp, m); } else { + m->m_pkthdr.header = frame_header; /* Send priority-tagged packet up through the parent */ dlil_input_packet_list(p, m); } diff --git a/bsd/net/route.c b/bsd/net/route.c index e00ce3eaa..7f4ec5ac6 100644 --- a/bsd/net/route.c +++ b/bsd/net/route.c @@ -186,7 +186,7 @@ route_init(void) { int size; - PE_parse_boot_arg("rte_debug", &rte_debug); + PE_parse_boot_argn("rte_debug", &rte_debug, sizeof (rte_debug)); if (rte_debug != 0) rte_debug |= RTD_DEBUG; diff --git a/bsd/netinet/Makefile b/bsd/netinet/Makefile index de3d2890a..26195cc07 100644 --- a/bsd/netinet/Makefile +++ b/bsd/netinet/Makefile @@ -34,7 +34,7 @@ KERNELFILES = \ PRIVATE_DATAFILES = \ if_fddi.h if_atm.h ip_dummynet.h \ tcp_debug.h \ - in_gif.h ip_compat.h + in_gif.h ip_compat.h ip_edgehole.h PRIVATE_KERNELFILES = ${KERNELFILES} \ ip_ecn.h ip_encap.h ip_flow.h diff --git a/bsd/netinet/in_arp.c b/bsd/netinet/in_arp.c index d764b3718..174aa7742 100644 --- a/bsd/netinet/in_arp.c +++ b/bsd/netinet/in_arp.c @@ -242,7 +242,8 @@ arp_rtrequest( gate = rt->rt_gateway; SDL(gate)->sdl_type = rt->rt_ifp->if_type; SDL(gate)->sdl_index = rt->rt_ifp->if_index; - rt->rt_expire = timenow.tv_sec; + /* In case we're called before 1.0 sec. has elapsed */ + rt->rt_expire = MAX(timenow.tv_sec, 1); break; } /* Announce a new entry if requested. */ @@ -296,7 +297,8 @@ arp_rtrequest( gate_ll->sdl_alen = broadcast_len; gate_ll->sdl_family = AF_LINK; gate_ll->sdl_len = sizeof(struct sockaddr_dl); - rt->rt_expire = timenow.tv_sec; + /* In case we're called before 1.0 sec. 
has elapsed */ + rt->rt_expire = MAX(timenow.tv_sec, 1); } #endif diff --git a/bsd/netinet/in_cksum.c b/bsd/netinet/in_cksum.c index 28601114c..c9fc56d86 100644 --- a/bsd/netinet/in_cksum.c +++ b/bsd/netinet/in_cksum.c @@ -156,7 +156,7 @@ inet_cksum(struct mbuf *m, unsigned int nxt, unsigned int skip, KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_START, len,0,0,0,0); /* sanity check */ - if (m->m_pkthdr.len < skip + len) { + if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.len < skip + len) { panic("inet_cksum: mbuf len (%d) < off+len (%d+%d)\n", m->m_pkthdr.len, skip, len); } @@ -248,7 +248,7 @@ inet_cksum(struct mbuf *m, unsigned int nxt, unsigned int skip, KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_START, len,0,0,0,0); /* sanity check */ - if (m->m_pkthdr.len < skip + len) { + if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.len < skip + len) { panic("inet_cksum: mbuf len (%d) < off+len (%d+%d)\n", m->m_pkthdr.len, skip, len); } diff --git a/bsd/netinet/in_pcb.c b/bsd/netinet/in_pcb.c index a17a1fb7e..fce3bb78b 100644 --- a/bsd/netinet/in_pcb.c +++ b/bsd/netinet/in_pcb.c @@ -230,6 +230,9 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, __unused struct proc * return (mac_error); } mac_inpcb_label_associate(so, inp); +#endif +#if CONFIG_IP_EDGEHOLE + ip_edgehole_attach(inp); #endif so->so_pcb = (caddr_t)inp; diff --git a/bsd/netinet/in_pcb.h b/bsd/netinet/in_pcb.h index 3fd86be88..0186e42a5 100644 --- a/bsd/netinet/in_pcb.h +++ b/bsd/netinet/in_pcb.h @@ -203,6 +203,10 @@ struct inpcb { void *pdp_ifp; #endif /* _KERN_SYS_KERNELTYPES_H_ */ #endif /* CONFIG_EMBEDDED */ +#if CONFIG_IP_EDGEHOLE + u_int32_t inpcb_edgehole_flags; + u_int32_t inpcb_edgehole_mask; +#endif }; #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet/ip_divert.c b/bsd/netinet/ip_divert.c index 73ab3ea91..ebc0772b0 100644 --- a/bsd/netinet/ip_divert.c +++ b/bsd/netinet/ip_divert.c @@ -370,6 +370,9 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, socket_unlock(so, 0); #if 
CONFIG_MACF_NET mac_mbuf_label_associate_inpcb(inp, m); +#endif +#if CONFIG_IP_EDGEHOLE + ip_edgehole_mbuf_tag(inp, m); #endif error = ip_output(m, inp->inp_options, &inp->inp_route, diff --git a/bsd/netinet/ip_edgehole.c b/bsd/netinet/ip_edgehole.c new file mode 100644 index 000000000..aa56449ea --- /dev/null +++ b/bsd/netinet/ip_edgehole.c @@ -0,0 +1,333 @@ +#include +#include +#include +#include +#include +#include // For bzero +#include // for printf +#include // For panic +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ip_edgehole.h" + +enum +{ + kEdgeHoleFlag_BlockInternet = 0x00000001, + kEdgeHoleFlag_BlockVV = 0x00000002 +}; + +struct edgehole_tag +{ + // flags tells us whether or not we should block traffic + u_int32_t eh_flags; + + // These fields are used to help us find the PCB after we block traffic for TCP + struct inpcbinfo *eh_inpinfo; + struct inpcb *eh_inp; +}; + +struct edgehole_delayed_notify +{ + // flags tells us whether or not we should block traffic + struct edgehole_delayed_notify *next; + + // These fields are used to help us find the PCB after we block traffic for TCP + struct inpcbinfo *inpinfo; + struct inpcb *inp; +}; + +static mbuf_tag_id_t edgehole_tag = 0; +static thread_call_t edgehole_callout = NULL; +static OSMallocTag edgehole_mtag = 0; +static struct edgehole_delayed_notify *edgehole_delay_list = NULL; + +#ifndef HAS_COMPARE_AND_SWAP_PTR +// 64bit kernels have an OSCompareAndSwapPtr that does the right thing +static Boolean +OSCompareAndSwapPtr( + void *oldValue, + void *newValue, + volatile void *address) +{ + return OSCompareAndSwap((UInt32)oldValue, (UInt32)newValue, (volatile UInt32*)address); +} +#endif + +static void +ip_edgehole_notify_delayed( + struct inpcb *inp, + struct inpcbinfo *inpinfo) +{ + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) + { + // We've found an inpcb for the packet we're 
dropping. + struct socket *so = inp->inp_socket; + if (so && so != &inpinfo->nat_dummy_socket) + { + socket_lock(so, 1); + if (in_pcb_checkstate(inp, WNT_RELEASE,1) != WNT_STOPUSING) + { + if (inp->inp_ip_p == IPPROTO_TCP) + { + // Why do we still have caddr_t? Come on! Casting from + // caddr_t to something else causes "cast increases required alignment" + // warnings. warnings are treated as failures. This union does the + // exact same thing without the warning. + union + { + caddr_t caddrt_sucks; + void *void_ptr; + } bite_me; + + bite_me.caddrt_sucks = inp->inp_ppcb; + tcp_drop((struct tcpcb*)bite_me.void_ptr, EPERM); + } + else + { + // Is this enough? + socantsendmore(so); + } + } + socket_unlock(so, 1); + } + } +} + +// Some shortcomings of this strategy: +// 1) an inpcb could be reused for a new socket before we get a chance to notify + +static void +ip_edgehole_process_delayed( + __unused void *unused1, + __unused void *unused2) +{ + struct edgehole_delayed_notify *head; + + while (edgehole_delay_list) + { + // Atomically grab the list + do + { + head = edgehole_delay_list; + } + while (!OSCompareAndSwapPtr(head, NULL, &edgehole_delay_list)); + + if (head == NULL) + { + break; + } + + // Prune duplicates from the list + struct edgehole_delayed_notify *current; + struct edgehole_delayed_notify **current_p; + struct edgehole_delayed_notify *ye_dead; + for (current = head; current && current->next; current = current->next) + { + current_p = &current->next; // scan only entries after current, so current never matches and frees itself + while (*current_p) + { + if ((*current_p)->inp == current->inp) + { + ye_dead = *current_p; + *current_p = ye_dead->next; + OSFree(ye_dead, sizeof(*ye_dead), edgehole_mtag); + } + else + { + current_p = &(*current_p)->next; + } + } + } + + while (head) + { + struct inpcbinfo *lockedinfo; + + lockedinfo = head->inpinfo; + + // Lock the list + lck_rw_lock_shared(lockedinfo->mtx); + + struct inpcb *inp; + + // Walk the inp list. 
+ LIST_FOREACH(inp, lockedinfo->listhead, inp_list) + { + // Walk the list of notifications + for (current = head; current != NULL; current = current->next) + { + // Found a match, notify + if (current->inpinfo == lockedinfo && current->inp == inp) + { + ip_edgehole_notify_delayed(inp, lockedinfo); + } + } + } + + lck_rw_done(lockedinfo->mtx); + + // Release all the notifications for this inpcbinfo + current_p = &head; + while (*current_p) + { + // Free any items for this inpcbinfo + if ((*current_p)->inpinfo == lockedinfo) + { + ye_dead = *current_p; + *current_p = ye_dead->next; + OSFree(ye_dead, sizeof(*ye_dead), edgehole_mtag); + } + else + { + current_p = &(*current_p)->next; + } + } + } + } +} + +static void +ip_edgehole_notify( + struct edgehole_tag *tag) +{ + // Since the lock on the socket may be held while a packet is being transmitted, + // we must allocate storage to keep track of this information and schedule a + // thread to handle the work. + + if (tag->eh_inp == NULL || tag->eh_inpinfo == NULL) + return; + + struct edgehole_delayed_notify *delayed = OSMalloc(sizeof(*delayed), edgehole_mtag); + if (delayed) + { + delayed->inp = tag->eh_inp; + delayed->inpinfo = tag->eh_inpinfo; + do + { + delayed->next = edgehole_delay_list; + } + while (!OSCompareAndSwapPtr(delayed->next, delayed, &edgehole_delay_list)); + + thread_call_enter(edgehole_callout); + } +} + +__private_extern__ void +ip_edgehole_attach( + struct inpcb *inp) +{ + inp->inpcb_edgehole_flags = 0; + inp->inpcb_edgehole_mask = 0; + + // TBD: call MAC framework to find out of we are allowed to use EDGE +#ifdef TEST_THE_EVIL_EDGE_HOLE + char pidname[64]; + proc_selfname(pidname, sizeof(pidname)); + pidname[sizeof(pidname) -1] = 0; + if (strcmp(pidname, "MobileSafari") == 0 || + strcmp(pidname, "ping") == 0) + { + inp->inpcb_edgehole_flags = kEdgeHoleFlag_BlockInternet; + inp->inpcb_edgehole_mask = kEdgeHoleFlag_BlockInternet; + } +#endif + + if (inp->inpcb_edgehole_mask != 0) + { + // Allocate a 
callout + if (edgehole_callout == NULL) + { + thread_call_t tmp_callout = thread_call_allocate(ip_edgehole_process_delayed, NULL); + if (!tmp_callout) panic("ip_edgehole_attach: thread_call_allocate failed"); + if (!OSCompareAndSwapPtr(NULL, tmp_callout, &edgehole_callout)) + thread_call_free(tmp_callout); + } + + // Allocate a malloc tag + if (edgehole_mtag == 0) + { + OSMallocTag mtag = OSMalloc_Tagalloc("com.apple.ip_edgehole", 0); + if (!mtag) panic("ip_edgehole_attach: OSMalloc_Tagalloc failed"); + if (!OSCompareAndSwapPtr(NULL, mtag, &edgehole_mtag)) + OSMalloc_Tagfree(mtag); + } + } +} + +__private_extern__ void +ip_edgehole_mbuf_tag( + struct inpcb *inp, + mbuf_t m) +{ + // Immediately bail if there are no flags on this inpcb + if (inp->inpcb_edgehole_mask == 0) + { + return; + } + + // Allocate a tag_id if we don't have one already + if (edgehole_tag == 0) + mbuf_tag_id_find("com.apple.edgehole", &edgehole_tag); + + struct edgehole_tag *tag; + size_t length; + + // Find an existing tag + if (mbuf_tag_find(m, edgehole_tag, 0, &length, (void**)&tag) == 0) + { + if (length != sizeof(*tag)) + panic("ip_edgehole_mbuf_tag - existing tag is wrong size"); + + // add restrictions + tag->eh_flags = (tag->eh_flags & (~inp->inpcb_edgehole_mask)) | + (inp->inpcb_edgehole_flags & inp->inpcb_edgehole_mask); + } + else if ((inp->inpcb_edgehole_mask & inp->inpcb_edgehole_flags) != 0) + { + // Add the tag + if (mbuf_tag_allocate(m, edgehole_tag, 0, sizeof(*tag), MBUF_WAITOK, (void**)&tag) != 0) + panic("ip_edgehole_mbuf_tag - mbuf_tag_allocate failed"); // ouch - how important is it that we block this stuff? 
+ + tag->eh_flags = (inp->inpcb_edgehole_flags & inp->inpcb_edgehole_mask); + tag->eh_inp = inp; + tag->eh_inpinfo = inp->inp_pcbinfo; + } +} + +int +ip_edgehole_filter( + mbuf_t *m, + __unused int isVV) +{ + struct edgehole_tag *tag; + size_t length; + + if (mbuf_tag_find(*m, edgehole_tag, 0, &length, (void**)&tag) == 0) + { + if (length != sizeof(*tag)) + panic("ip_edgehole_filter - existing tag is wrong size"); + + if ((tag->eh_flags & kEdgeHoleFlag_BlockInternet) != 0) + { + ip_edgehole_notify(tag); + + mbuf_freem(*m); *m = NULL; + return EPERM; + } + } + + return 0; +} diff --git a/bsd/netinet/ip_edgehole.h b/bsd/netinet/ip_edgehole.h new file mode 100644 index 000000000..5bfe7a05b --- /dev/null +++ b/bsd/netinet/ip_edgehole.h @@ -0,0 +1,17 @@ +#include + +struct inpcb; + +// Tag an mbuf on the way out with the edge flags from the inpcb +extern void ip_edgehole_mbuf_tag(struct inpcb *inp, mbuf_t m); + +// Attach the edge flags to the inpcb +extern void ip_edgehole_attach(struct inpcb *inp); + +// Called by the edge interface to determine if the edge interface +// should drop the packet. Will return 0 if the packet should continue +// to be processed or EPERM if ip_edgehole_filter swallowed the packet. +// When ip_edgehole_filter swallows a packet, it frees it and sets your +// pointer to it to NULL. isVV should be set to zero unless the edge +// interface in question is the visual voicemail edge interface. 
+extern int ip_edgehole_filter(mbuf_t *m, int isVV); diff --git a/bsd/netinet/raw_ip.c b/bsd/netinet/raw_ip.c index e30687513..533184f4e 100644 --- a/bsd/netinet/raw_ip.c +++ b/bsd/netinet/raw_ip.c @@ -381,6 +381,10 @@ rip_output(m, so, dst) mac_mbuf_label_associate_inpcb(inp, m); #endif +#if CONFIG_IP_EDGEHOLE + ip_edgehole_mbuf_tag(inp, m); +#endif + #if CONFIG_FORCE_OUT_IFP return (ip_output_list(m, 0, inp->inp_options, &inp->inp_route, flags, inp->inp_moptions, inp->pdp_ifp)); diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c index 59dd0cb78..36756785d 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -1734,6 +1734,14 @@ tcp_input(m, off0) /* ECN-setup SYN */ tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT); } +#ifdef IFEF_NOWINDOWSCALE + if (m->m_pkthdr.rcvif != NULL && + (m->m_pkthdr.rcvif->if_eflags & IFEF_NOWINDOWSCALE) != 0) + { + // Timestamps are not enabled on this interface + tp->t_flags &= ~(TF_REQ_SCALE); + } +#endif goto trimthenstep6; } @@ -2393,44 +2401,72 @@ tcp_input(m, off0) tp->t_dupacks = 0; break; } + + if (!IN_FASTRECOVERY(tp)) { + /* + * We were not in fast recovery. Reset the duplicate ack + * counter. + */ + tp->t_dupacks = 0; + } /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (tcp_do_newreno || tp->sack_enable) { - if (IN_FASTRECOVERY(tp)) { + else { + if (tcp_do_newreno || tp->sack_enable) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->sack_enable) tcp_sack_partialack(tp, th); else - tcp_newreno_partial_ack(tp, th); - } else { - /* - * Out of fast recovery. - * Window inflation should have left us - * with approximately snd_ssthresh - * outstanding data. - * But in case we would be inclined to - * send a burst, better to do it via - * the slow start mechanism. 
- */ - if (SEQ_GT(th->th_ack + - tp->snd_ssthresh, - tp->snd_max)) - tp->snd_cwnd = tp->snd_max - - th->th_ack + - tp->t_maxseg; - else - tp->snd_cwnd = tp->snd_ssthresh; + tcp_newreno_partial_ack(tp, th); + } + else { + if (tcp_do_newreno) { + long ss = tp->snd_max - th->th_ack; + + /* + * Complete ack. Inflate the congestion window to + * ssthresh and exit fast recovery. + * + * Window inflation should have left us with approx. + * snd_ssthresh outstanding data. But in case we + * would be inclined to send a burst, better to do + * it via the slow start mechanism. + */ + if (ss < tp->snd_ssthresh) + tp->snd_cwnd = ss + tp->t_maxseg; + else + tp->snd_cwnd = tp->snd_ssthresh; + } + else { + /* + * Clamp the congestion window to the crossover point + * and exit fast recovery. + */ + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + } + + EXIT_FASTRECOVERY(tp); + tp->t_dupacks = 0; + tp->t_bytes_acked = 0; } } - } else { - if (tp->t_dupacks >= tcprexmtthresh && - tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd = tp->snd_ssthresh; + else { + /* + * Clamp the congestion window to the crossover point + * and exit fast recovery in non-newreno and non-SACK case. + */ + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + EXIT_FASTRECOVERY(tp); + tp->t_dupacks = 0; + tp->t_bytes_acked = 0; + } } - tp->t_dupacks = 0; - tp->t_bytes_acked = 0; + + /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. 
diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c index ca32f6b40..e22e04993 100644 --- a/bsd/netinet/tcp_output.c +++ b/bsd/netinet/tcp_output.c @@ -1199,6 +1199,9 @@ tcp_output(struct tcpcb *tp) #if CONFIG_MACF_NET mac_mbuf_label_associate_inpcb(tp->t_inpcb, m); #endif +#if CONFIG_IP_EDGEHOLE + ip_edgehole_mbuf_tag(tp->t_inpcb, m); +#endif #if INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); @@ -1652,7 +1655,7 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, unlocked = TRUE; socket_unlock(so, 0); } - + /* * Don't send down a chain of packets when: * - TCP chaining is disabled diff --git a/bsd/netinet/tcp_subr.c b/bsd/netinet/tcp_subr.c index f51febfc1..a94f8ad2a 100644 --- a/bsd/netinet/tcp_subr.c +++ b/bsd/netinet/tcp_subr.c @@ -608,6 +608,12 @@ tcp_respond( mac_netinet_tcp_reply(m); } #endif + +#if CONFIG_IP_EDGEHOLE + if (tp && tp->t_inpcb) + ip_edgehole_mbuf_tag(tp->t_inpcb, m); +#endif + nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; @@ -1433,11 +1439,6 @@ tcp6_ctlinput(cmd, sa, d) #define ISN_BYTES_PER_SECOND 1048576 -//PWC - md5 routines cause alignment exceptions. Need to figure out why. For now use lame incremental -// isn. how's that for not easily guessable!? - -int pwc_bogus; - tcp_seq tcp_new_isn(tp) struct tcpcb *tp; @@ -1625,7 +1626,7 @@ tcp_mtudisc( /* * Look-up the routing entry to the peer of this inpcb. If no route - * is found and it cannot be allocated the return NULL. This routine + * is found and it cannot be allocated then return NULL. This routine * is called by TCP routines that access the rmx structure and by tcp_mss * to get the interface MTU. 
*/ @@ -1675,6 +1676,15 @@ tcp_rtlookup(inp) else tp->t_flags |= TF_PMTUD; +#ifdef IFEF_NOWINDOWSCALE + if (tp->t_state == TCPS_SYN_SENT && rt != NULL && rt->rt_ifp != NULL && + (rt->rt_ifp->if_eflags & IFEF_NOWINDOWSCALE) != 0) + { + // Timestamps are not enabled on this interface + tp->t_flags &= ~(TF_REQ_SCALE); + } +#endif + return rt; } diff --git a/bsd/netinet/tcp_usrreq.c b/bsd/netinet/tcp_usrreq.c index ed796e474..d194a867f 100644 --- a/bsd/netinet/tcp_usrreq.c +++ b/bsd/netinet/tcp_usrreq.c @@ -155,7 +155,11 @@ SYSCTL_QUAD(_net_inet_tcp, OID_AUTO, out_sw_cksum_bytes, CTLFLAG_RD, #define TCPDEBUG2(req) #endif +#if CONFIG_USESOCKTHRESHOLD __private_extern__ unsigned int tcp_sockthreshold = 64; +#else +__private_extern__ unsigned int tcp_sockthreshold = 0; +#endif SYSCTL_INT(_net_inet_tcp, OID_AUTO, sockthreshold, CTLFLAG_RW, &tcp_sockthreshold , 0, "TCP Socket size increased if less than threshold"); diff --git a/bsd/netinet/udp_usrreq.c b/bsd/netinet/udp_usrreq.c index 7e97ffd9a..88e5413f5 100644 --- a/bsd/netinet/udp_usrreq.c +++ b/bsd/netinet/udp_usrreq.c @@ -1102,7 +1102,10 @@ udp_output(inp, m, addr, control, p) #if CONFIG_MACF_NET mac_mbuf_label_associate_inpcb(inp, m); #endif - + +#if CONFIG_IP_EDGEHOLE + ip_edgehole_mbuf_tag(inp, m); +#endif /* * Calculate data length and get a mbuf @@ -1317,6 +1320,7 @@ udp_send(struct socket *so, __unused int flags, struct mbuf *m, struct sockaddr m_freem(m); return EINVAL; } + return udp_output(inp, m, addr, control, p); } diff --git a/bsd/netkey/key.c b/bsd/netkey/key.c index 86ea9ada0..0e1d6dc69 100644 --- a/bsd/netkey/key.c +++ b/bsd/netkey/key.c @@ -527,6 +527,9 @@ void ipsec_send_natt_keepalive(struct secasvar *sav); void key_init(void); +static errno_t ipsecif_register_control(void); + + /* * PF_KEY init @@ -561,6 +564,10 @@ key_init(void) LIST_INIT(&spihash[i]); raw_init(); + + /* register ip_if application of kernel control */ + ipsecif_register_control(); + } @@ -8179,3 +8186,655 @@ key_alloc_mbuf(l) 
return m; } + + +/* ---------------------------------------------------------------------------------- +Application of kernel control for interface creation + +Theory of operation: +ipsecif acts as glue between kernel control sockets and ipsec network interfaces. This +kernel control will register an interface for every client that connects. +ipsec interface do not send or receive packets, an they are intercepted by ipsec before +they reach the interface. ipsec needs interface to attach tunnel ip addresses. +In the future, we may want to change the control mechanism to use PF_KEY to create +interfaces for ipsec +---------------------------------------------------------------------------------- */ + +#include +//#include "if_ip.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include /* Until leopard, our ugly bpf protocol prepend will need this */ +#include +#include +#include + +/* +*/ + +#define IPSECIF_CONTROL_NAME "com.apple.net.ipsecif_control" + +/* Kernel Control functions */ +static errno_t ipsecif_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac, + void **unitinfo); +static errno_t ipsecif_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t unit, + void *unitinfo); +static errno_t ipsecif_ctl_send(kern_ctl_ref kctlref, u_int32_t unit, + void *unitinfo, mbuf_t m, int flags); + +/* Network Interface functions */ +static errno_t ipsecif_output(ifnet_t interface, mbuf_t data); +static errno_t ipsecif_demux(ifnet_t interface, mbuf_t data, char *frame_header, + protocol_family_t *protocol); +static errno_t ipsecif_add_proto(ifnet_t interface, protocol_family_t protocol, + const struct ifnet_demux_desc *demux_array, + u_int32_t demux_count); +static errno_t ipsecif_del_proto(ifnet_t interface, protocol_family_t protocol); +static errno_t ipsecif_ioctl(ifnet_t interface, u_int32_t cmd, void *data); +static errno_t ipsecif_settap(ifnet_t interface, bpf_tap_mode mode, + bpf_packet_func callback); +static void 
ipsecif_detached(ifnet_t interface); + +/* Protocol handlers */ +static errno_t ipsecif_attach_proto(ifnet_t interface, protocol_family_t proto); +static errno_t ipsecif_proto_input(ifnet_t interface, protocol_family_t protocol, + mbuf_t m, char *frame_header); + +/* Control block allocated for each kernel control connection */ +struct ipsecif_pcb { + kern_ctl_ref ctlref; + u_int32_t unit; + ifnet_t ifp; + bpf_tap_mode mode; + bpf_packet_func tap; +}; + +static kern_ctl_ref ipsecif_kctlref; +static u_int32_t ipsecif_family; +static OSMallocTag ipsecif_malloc_tag; +static SInt32 ipsecif_ifcount = 0; + +/* Prepend length */ +static void* +ipsecif_alloc(size_t size) +{ + size_t *mem = OSMalloc(size + sizeof(size_t), ipsecif_malloc_tag); + + if (mem) { + *mem = size + sizeof(size_t); + mem++; + } + + return (void*)mem; +} + +static void +ipsecif_free(void *ptr) +{ + size_t *size = ptr; + size--; + OSFree(size, *size, ipsecif_malloc_tag); +} + +static errno_t +ipsecif_register_control(void) +{ + struct kern_ctl_reg kern_ctl; + errno_t result = 0; + + /* Create a tag to allocate memory */ + ipsecif_malloc_tag = OSMalloc_Tagalloc(IPSECIF_CONTROL_NAME, OSMT_DEFAULT); + + /* Find a unique value for our interface family */ + result = mbuf_tag_id_find(IPSECIF_CONTROL_NAME, &ipsecif_family); + if (result != 0) { + printf("ipsecif_register_control - mbuf_tag_id_find_internal failed: %d\n", result); + return result; + } + + bzero(&kern_ctl, sizeof(kern_ctl)); + strncpy(kern_ctl.ctl_name, IPSECIF_CONTROL_NAME, sizeof(kern_ctl.ctl_name)); + kern_ctl.ctl_name[sizeof(kern_ctl.ctl_name) - 1] = 0; + kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED; /* Require root */ + kern_ctl.ctl_connect = ipsecif_ctl_connect; + kern_ctl.ctl_disconnect = ipsecif_ctl_disconnect; + kern_ctl.ctl_send = ipsecif_ctl_send; + + result = ctl_register(&kern_ctl, &ipsecif_kctlref); + if (result != 0) { + printf("ipsecif_register_control - ctl_register failed: %d\n", result); + return result; + } + + /* Register the 
protocol plumbers */ + if ((result = proto_register_plumber(PF_INET, ipsecif_family, + ipsecif_attach_proto, NULL)) != 0) { + printf("ipsecif_register_control - proto_register_plumber(PF_INET, %d) failed: %d\n", + ipsecif_family, result); + ctl_deregister(ipsecif_kctlref); + return result; + } + + /* Register the protocol plumbers */ + if ((result = proto_register_plumber(PF_INET6, ipsecif_family, + ipsecif_attach_proto, NULL)) != 0) { + proto_unregister_plumber(PF_INET, ipsecif_family); + ctl_deregister(ipsecif_kctlref); + printf("ipsecif_register_control - proto_register_plumber(PF_INET6, %d) failed: %d\n", + ipsecif_family, result); + return result; + } + + return 0; +} + +/* Kernel control functions */ + +static errno_t +ipsecif_ctl_connect( + kern_ctl_ref kctlref, + struct sockaddr_ctl *sac, + void **unitinfo) +{ + struct ifnet_init_params ipsecif_init; + struct ipsecif_pcb *pcb; + errno_t result; + + /* kernel control allocates, interface frees */ + pcb = ipsecif_alloc(sizeof(*pcb)); + if (pcb == NULL) + return ENOMEM; + + /* Setup the protocol control block */ + bzero(pcb, sizeof(*pcb)); + *unitinfo = pcb; + pcb->ctlref = kctlref; + pcb->unit = sac->sc_unit; + printf("ipsecif_ctl_connect: creating unit ip%d\n", pcb->unit); + + /* Create the interface */ + bzero(&ipsecif_init, sizeof(ipsecif_init)); + ipsecif_init.name = "ipsec"; + ipsecif_init.unit = pcb->unit; + ipsecif_init.family = ipsecif_family; + ipsecif_init.type = IFT_OTHER; + ipsecif_init.output = ipsecif_output; + ipsecif_init.demux = ipsecif_demux; + ipsecif_init.add_proto = ipsecif_add_proto; + ipsecif_init.del_proto = ipsecif_del_proto; + ipsecif_init.softc = pcb; + ipsecif_init.ioctl = ipsecif_ioctl; + ipsecif_init.set_bpf_tap = ipsecif_settap; + ipsecif_init.detach = ipsecif_detached; + + result = ifnet_allocate(&ipsecif_init, &pcb->ifp); + if (result != 0) { + printf("ipsecif_ctl_connect - ifnet_allocate failed: %d\n", result); + ipsecif_free(pcb); + return result; + } + 
OSIncrementAtomic(&ipsecif_ifcount); + + /* Set flags and additional information. */ + ifnet_set_mtu(pcb->ifp, 1280); + ifnet_set_flags(pcb->ifp, IFF_UP | IFF_MULTICAST | IFF_BROADCAST, 0xffff); +// ifnet_set_flags(pcb->ifp, IFF_UP | IFF_MULTICAST | IFF_POINTOPOINT, 0xffff); + + /* Attach the interface */ + result = ifnet_attach(pcb->ifp, NULL); + if (result != 0) { + printf("ipsecif_ctl_connect - ifnet_attach failed: %d\n", result); + ifnet_release(pcb->ifp); + ipsecif_free(pcb); + } + + /* Attach to bpf */ + if (result == 0) + bpfattach(pcb->ifp, DLT_NULL, 4); + + return result; +} + +/* + * These defines are marked private but it's impossible to remove an interface + * without them. + */ +#ifndef SIOCPROTODETACH +#define SIOCPROTODETACH _IOWR('i', 81, struct ifreq) /* detach proto from interface */ +#endif /* SIOCPROTODETACH */ + +#ifndef SIOCPROTODETACH_IN6 +#define SIOCPROTODETACH_IN6 _IOWR('i', 111, struct in6_ifreq) /* detach proto from interface */ +#endif /* SIOCPROTODETACH_IN6 */ + + +static errno_t +ipsecif_detach_ip( + ifnet_t interface, + protocol_family_t protocol, + socket_t pf_socket) +{ + errno_t result = EPROTONOSUPPORT; + + /* Attempt a detach */ + if (protocol == PF_INET) { + struct ifreq ifr; + + bzero(&ifr, sizeof(ifr)); + snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s%d", + ifnet_name(interface), ifnet_unit(interface)); + + result = sock_ioctl(pf_socket, SIOCPROTODETACH, &ifr); + } + else if (protocol == PF_INET6) { + struct in6_ifreq ifr6; + + bzero(&ifr6, sizeof(ifr6)); + snprintf(ifr6.ifr_name, sizeof(ifr6.ifr_name), "%s%d", + ifnet_name(interface), ifnet_unit(interface)); + + result = sock_ioctl(pf_socket, SIOCPROTODETACH_IN6, &ifr6); + } + + return result; +} + +static void +ipsecif_remove_address( + ifnet_t interface, + protocol_family_t protocol, + ifaddr_t address, + socket_t pf_socket) +{ + errno_t result = 0; + + /* Attempt a detach */ + if (protocol == PF_INET) { + struct ifreq ifr; + + bzero(&ifr, sizeof(ifr)); + 
snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s%d", + ifnet_name(interface), ifnet_unit(interface)); + result = ifaddr_address(address, &ifr.ifr_addr, sizeof(ifr.ifr_addr)); + if (result != 0) { + printf("ipsecif_remove_address - ifaddr_address failed: %d", result); + } + else { + result = sock_ioctl(pf_socket, SIOCDIFADDR, &ifr); + if (result != 0) { + printf("ipsecif_remove_address - SIOCDIFADDR failed: %d", result); + } + } + } + else if (protocol == PF_INET6) { + struct in6_ifreq ifr6; + + bzero(&ifr6, sizeof(ifr6)); + snprintf(ifr6.ifr_name, sizeof(ifr6.ifr_name), "%s%d", + ifnet_name(interface), ifnet_unit(interface)); + result = ifaddr_address(address, (struct sockaddr*)&ifr6.ifr_addr, + sizeof(ifr6.ifr_addr)); + if (result != 0) { + printf("ipsecif_remove_address - ifaddr_address failed (v6): %d", + result); + } + else { + result = sock_ioctl(pf_socket, SIOCDIFADDR_IN6, &ifr6); + if (result != 0) { + printf("ipsecif_remove_address - SIOCDIFADDR_IN6 failed: %d", + result); + } + } + } +} + +static void +ipsecif_cleanup_family( + ifnet_t interface, + protocol_family_t protocol) +{ + errno_t result = 0; + socket_t pf_socket = NULL; + ifaddr_t *addresses = NULL; + int i; + + if (protocol != PF_INET && protocol != PF_INET6) { + printf("ipsecif_cleanup_family - invalid protocol family %d\n", protocol); + return; + } + + /* Create a socket for removing addresses and detaching the protocol */ + result = sock_socket(protocol, SOCK_DGRAM, 0, NULL, NULL, &pf_socket); + if (result != 0) { + if (result != EAFNOSUPPORT) + printf("ipsecif_cleanup_family - failed to create %s socket: %d\n", + protocol == PF_INET ? "IP" : "IPv6", result); + goto cleanup; + } + + result = ipsecif_detach_ip(interface, protocol, pf_socket); + if (result == 0 || result == ENXIO) { + /* We are done! We either detached or weren't attached. */ + goto cleanup; + } + else if (result != EBUSY) { + /* Uh, not really sure what happened here... 
*/ + printf("ipsecif_cleanup_family - ipsecif_detach_ip failed: %d\n", result); + goto cleanup; + } + + /* + * At this point, we received an EBUSY error. This means there are + * addresses attached. We should detach them and then try again. + */ + result = ifnet_get_address_list_family(interface, &addresses, protocol); + if (result != 0) { + printf("fnet_get_address_list_family(%s%d, 0xblah, %s) - failed: %d\n", + ifnet_name(interface), ifnet_unit(interface), + protocol == PF_INET ? "PF_INET" : "PF_INET6", result); + goto cleanup; + } + + for (i = 0; addresses[i] != 0; i++) { + ipsecif_remove_address(interface, protocol, addresses[i], pf_socket); + } + ifnet_free_address_list(addresses); + addresses = NULL; + + /* + * The addresses should be gone, we should try the remove again. + */ + result = ipsecif_detach_ip(interface, protocol, pf_socket); + if (result != 0 && result != ENXIO) { + printf("ipsecif_cleanup_family - ipsecif_detach_ip failed: %d\n", result); + } + +cleanup: + if (pf_socket != NULL) + sock_close(pf_socket); + + if (addresses != NULL) + ifnet_free_address_list(addresses); +} + +static errno_t +ipsecif_ctl_disconnect( + __unused kern_ctl_ref kctlref, + __unused u_int32_t unit, + void *unitinfo) +{ + struct ipsecif_pcb *pcb = unitinfo; + ifnet_t ifp = pcb->ifp; + errno_t result = 0; + + pcb->ctlref = NULL; + pcb->unit = 0; + + /* + * We want to do everything in our power to ensure that the interface + * really goes away when the socket is closed. We must remove IP/IPv6 + * addresses and detach the protocols. Finally, we can remove and + * release the interface. 
+ */ + ipsecif_cleanup_family(ifp, AF_INET); + ipsecif_cleanup_family(ifp, AF_INET6); + + if ((result = ifnet_detach(ifp)) != 0) { + printf("ipsecif_ctl_disconnect - ifnet_detach failed: %d\n", result); + } + + if ((result = ifnet_release(ifp)) != 0) { + printf("ipsecif_ctl_disconnect - ifnet_release failed: %d\n", result); + } + + return 0; +} + +static inline void +call_bpf_tap( + ifnet_t ifp, + bpf_packet_func tap, + mbuf_t m) +{ + struct m_hdr hack_hdr; + struct mbuf *n; + int af; + + if (!tap) + return; + + af = (((*(char*)(mbuf_data(m))) & 0xf0) >> 4); // 4 or 6 + if(af == 4) { + af = AF_INET; + } + else if (af == 6) { + af = AF_INET6; + } + else { + /* Uh...this ain't right */ + af = 0; + } + + hack_hdr.mh_next = (struct mbuf*)m; + hack_hdr.mh_nextpkt = NULL; + hack_hdr.mh_len = 4; + hack_hdr.mh_data = (char *)&af; + hack_hdr.mh_type = ((struct mbuf*)m)->m_type; + hack_hdr.mh_flags = 0; + + n = (struct mbuf*)&hack_hdr; + + tap(ifp, (mbuf_t)n); +} + + +static errno_t +ipsecif_ctl_send( + __unused kern_ctl_ref kctlref, + __unused u_int32_t unit, + void *unitinfo, + mbuf_t m, + __unused int flags) +{ + struct ipsecif_pcb *pcb = unitinfo; + struct ifnet_stat_increment_param incs; + errno_t result; + + bzero(&incs, sizeof(incs)); + + mbuf_pkthdr_setrcvif(m, pcb->ifp); + + if (pcb->mode & BPF_MODE_INPUT) { + call_bpf_tap(pcb->ifp, pcb->tap, m); + } + + incs.packets_in = 1; + incs.bytes_in = mbuf_pkthdr_len(m); + result = ifnet_input(pcb->ifp, m, &incs); + if (result != 0) { + ifnet_stat_increment_in(pcb->ifp, 0, 0, 1); + printf("ipsecif_ctl_send - ifnet_input failed: %d\n", result); + mbuf_freem(m); + } + + return 0; +} + +/* Network Interface functions */ +static errno_t +ipsecif_output( + ifnet_t interface, + mbuf_t data) +{ + struct ipsecif_pcb *pcb = ifnet_softc(interface); + errno_t result; + + if (pcb->mode & BPF_MODE_OUTPUT) { + call_bpf_tap(interface, pcb->tap, data); + } + + // no packet should go to the ipsec interface + mbuf_freem(data); + +#if 0 + if
(pcb->ctlref) { + int length = mbuf_pkthdr_len(data); + result = ctl_enqueuembuf(pcb->ctlref, pcb->unit, data, CTL_DATA_EOR); + if (result != 0) { + mbuf_freem(data); + printf("ipsecif_output - ctl_enqueuembuf failed: %d\n", result); + ifnet_stat_increment_out(interface, 0, 0, 1); + } + else { + ifnet_stat_increment_out(interface, 1, length, 0); + } + } + else + mbuf_freem(data); +#endif + + return 0; +} + +/* Network Interface functions */ +static errno_t +ipsecif_demux( + __unused ifnet_t interface, + mbuf_t data, + __unused char *frame_header, + protocol_family_t *protocol) +{ + u_int8_t *vers; + + while (data != NULL && mbuf_len(data) < 1) { + data = mbuf_next(data); + } + + if (data != NULL) { + vers = mbuf_data(data); + switch(((*vers) & 0xf0) >> 4) { + case 4: + *protocol = PF_INET; + return 0; + + case 6: + *protocol = PF_INET6; + return 0; + } + } + + return ENOENT; +} + +static errno_t +ipsecif_add_proto( + __unused ifnet_t interface, + protocol_family_t protocol, + __unused const struct ifnet_demux_desc *demux_array, + __unused u_int32_t demux_count) +{ + switch(protocol) { + case PF_INET: + return 0; + case PF_INET6: + return 0; + default: + break; + } + + return ENOPROTOOPT; +} + +static errno_t +ipsecif_del_proto( + __unused ifnet_t interface, + __unused protocol_family_t protocol) +{ + return 0; +} + +static errno_t +ipsecif_ioctl( + __unused ifnet_t interface, + __unused u_int32_t command, + __unused void *data) +{ + errno_t result = 0; + + switch(command) { + case SIOCSIFMTU: + ifnet_set_mtu(interface, ((struct ifreq*)data)->ifr_mtu); + break; + + default: + result = EOPNOTSUPP; + } + + return result; +} + +static errno_t +ipsecif_settap( + ifnet_t interface, + bpf_tap_mode mode, + bpf_packet_func callback) +{ + struct ipsecif_pcb *pcb = ifnet_softc(interface); + + pcb->mode = mode; + pcb->tap = callback; + + return 0; +} + +static void +ipsecif_detached( + ifnet_t interface) +{ + struct ipsecif_pcb *pcb = ifnet_softc(interface); + + 
ipsecif_free(pcb); + + OSDecrementAtomic(&ipsecif_ifcount); +} + +/* Protocol Handlers */ + +static errno_t +ipsecif_proto_input( + __unused ifnet_t interface, + protocol_family_t protocol, + mbuf_t m, + __unused char *frame_header) +{ + proto_input(protocol, m); + + return 0; +} + +static errno_t +ipsecif_attach_proto( + ifnet_t interface, + protocol_family_t protocol) +{ + struct ifnet_attach_proto_param proto; + errno_t result; + + bzero(&proto, sizeof(proto)); + proto.input = ipsecif_proto_input; + + result = ifnet_attach_protocol(interface, protocol, &proto); + if (result != 0 && result != EEXIST) { + printf("ipsecif_attach_inet - ifnet_attach_protocol %d failed: %d\n", + protocol, result); + } + + return result; +} + diff --git a/bsd/nfs/nfs_lock.c b/bsd/nfs/nfs_lock.c index c767ae523..68125dd26 100644 --- a/bsd/nfs/nfs_lock.c +++ b/bsd/nfs/nfs_lock.c @@ -686,10 +686,14 @@ nfs3_vnop_advlock( /* * Fill in the information structure. + * We set all values to zero with bzero to clear + * out any information in the sockaddr_storage + * and nfs_filehandle contained in msgreq so that + * we will not leak extraneous information out of + * the kernel when calling up to lockd via our mig + * generated routine. */ - msgreq.lmr_answered = 0; - msgreq.lmr_errno = 0; - msgreq.lmr_saved_errno = 0; + bzero(&msgreq, sizeof(msgreq)); msg = &msgreq.lmr_msg; msg->lm_version = LOCKD_MSG_VERSION; msg->lm_flags = 0; diff --git a/bsd/nfs/nfs_vfsops.c b/bsd/nfs/nfs_vfsops.c index 7d65f3c4f..d7e3d0c3c 100644 --- a/bsd/nfs/nfs_vfsops.c +++ b/bsd/nfs/nfs_vfsops.c @@ -572,14 +572,15 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx) if (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_CASE_PRESERVING) caps |= VOL_CAP_FMT_CASE_PRESERVING; } + /* Note: VOL_CAP_FMT_2TB_FILESIZE is actually used to test for "large file support" */ if (NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_MAXFILESIZE)) { - /* Is server's max file size at least 2TB? 
*/ - if (nmp->nm_fsattr.nfsa_maxfilesize >= 0x20000000000ULL) + /* Is server's max file size at least 4GB? */ + if (nmp->nm_fsattr.nfsa_maxfilesize >= 0x100000000ULL) caps |= VOL_CAP_FMT_2TB_FILESIZE; } else if (nfsvers >= NFS_VER3) { /* * NFSv3 and up supports 64 bits of file size. - * So, we'll just assume maxfilesize >= 2TB + * So, we'll just assume maxfilesize >= 4GB */ caps |= VOL_CAP_FMT_2TB_FILESIZE; } @@ -780,8 +781,7 @@ nfs3_fsinfo(struct nfsmount *nmp, nfsnode_t np, vfs_context_t ctx) if (maxsize < nmp->nm_readdirsize) nmp->nm_readdirsize = maxsize; - nfsm_chain_get_64(error, &nmrep, maxsize); - nmp->nm_fsattr.nfsa_maxfilesize = maxsize; + nfsm_chain_get_64(error, &nmrep, nmp->nm_fsattr.nfsa_maxfilesize); nfsm_chain_adv(error, &nmrep, 2 * NFSX_UNSIGNED); // skip time_delta @@ -906,7 +906,7 @@ nfs_mountroot(void) //PWC hack until we have a real "mount" tool to remount root rw int rw_root=0; int flags = MNT_ROOTFS|MNT_RDONLY; - PE_parse_boot_arg("-rwroot_hack", &rw_root); + PE_parse_boot_argn("-rwroot_hack", &rw_root, sizeof (rw_root)); if(rw_root) { flags = MNT_ROOTFS; diff --git a/bsd/sys/Makefile b/bsd/sys/Makefile index 3324fe92a..907cc9a64 100644 --- a/bsd/sys/Makefile +++ b/bsd/sys/Makefile @@ -7,6 +7,14 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) +ALLPRODUCTS = AppleTV iPhone MacOSX +PRODUCT = $(shell tconf --product) +EXTRAUNIFDEF = $(foreach x,$(ALLPRODUCTS),$(if $(findstring $(PRODUCT),$(x)),-DPRODUCT_$(x),-UPRODUCT_$(x))) +SINCFRAME_UNIFDEF += $(EXTRAUNIFDEF) +SPINCFRAME_UNIFDEF += $(EXTRAUNIFDEF) +KINCFRAME_UNIFDEF += $(EXTRAUNIFDEF) +KPINCFRAME_UNIFDEF += $(EXTRAUNIFDEF) + INSTINC_SUBDIRS = \ INSTINC_SUBDIRS_PPC = \ diff --git a/bsd/sys/cdefs.h b/bsd/sys/cdefs.h index c4881ef33..8cd063f27 100644 --- a/bsd/sys/cdefs.h +++ b/bsd/sys/cdefs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -251,6 +251,32 @@ * used without a prototype in scope. */ +/* These settings are particular to each product. */ +#ifdef KERNEL +#define __DARWIN_ONLY_64_BIT_INO_T 0 +#define __DARWIN_ONLY_UNIX_CONFORMANCE 0 +#define __DARWIN_ONLY_VERS_1050 0 +#else /* !KERNEL */ +#ifdef PRODUCT_AppleTV +/* Product: AppleTV */ +#define __DARWIN_ONLY_64_BIT_INO_T 1 +#define __DARWIN_ONLY_UNIX_CONFORMANCE 1 +#define __DARWIN_ONLY_VERS_1050 1 +#endif /* PRODUCT_AppleTV */ +#ifdef PRODUCT_iPhone +/* Product: iPhone */ +#define __DARWIN_ONLY_64_BIT_INO_T 1 +#define __DARWIN_ONLY_UNIX_CONFORMANCE 1 +#define __DARWIN_ONLY_VERS_1050 1 +#endif /* PRODUCT_iPhone */ +#ifdef PRODUCT_MacOSX +/* Product: MacOSX */ +#define __DARWIN_ONLY_64_BIT_INO_T 0 +/* #undef __DARWIN_ONLY_UNIX_CONFORMANCE (automatically set for 64-bit) */ +#define __DARWIN_ONLY_VERS_1050 0 +#endif /* PRODUCT_MacOSX */ +#endif /* KERNEL */ + /* * The __DARWIN_ALIAS macros are used to do symbol renaming; they allow * legacy code to use the old symbol, thus maintiang binary compatability @@ -269,13 +295,28 @@ * pre-10.5, and it is the default compilation environment, revert the * compilation environment to pre-__DARWIN_UNIX03. */ +#if !defined(__DARWIN_ONLY_UNIX_CONFORMANCE) +# if defined(__LP64__) +# define __DARWIN_ONLY_UNIX_CONFORMANCE 1 +# else /* !__LP64__ */ +# define __DARWIN_ONLY_UNIX_CONFORMANCE 0 +# endif /* __LP64__ */ +#endif /* !__DARWIN_ONLY_UNIX_CONFORMANCE */ + #if !defined(__DARWIN_UNIX03) -# if defined(_DARWIN_C_SOURCE) || defined(_XOPEN_SOURCE) || defined(_POSIX_C_SOURCE) || defined(__LP64__) || (defined(__arm__) && !defined(KERNEL)) +# if defined(KERNEL) +# define __DARWIN_UNIX03 0 +# elif __DARWIN_ONLY_UNIX_CONFORMANCE # if defined(_NONSTD_SOURCE) -# error "Can't define both _NONSTD_SOURCE and any of _DARWIN_C_SOURCE, _XOPEN_SOURCE, _POSIX_C_SOURCE, or __LP64__" +# error "Can't define _NONSTD_SOURCE when only UNIX conformance is available." 
# endif /* _NONSTD_SOURCE */ # define __DARWIN_UNIX03 1 -# elif defined(_NONSTD_SOURCE) || defined(KERNEL) +# elif defined(_DARWIN_C_SOURCE) || defined(_XOPEN_SOURCE) || defined(_POSIX_C_SOURCE) +# if defined(_NONSTD_SOURCE) +# error "Can't define both _NONSTD_SOURCE and any of _DARWIN_C_SOURCE, _XOPEN_SOURCE or _POSIX_C_SOURCE." +# endif /* _NONSTD_SOURCE */ +# define __DARWIN_UNIX03 1 +# elif defined(_NONSTD_SOURCE) # define __DARWIN_UNIX03 0 # else /* default */ # if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && ((__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__-0) < 1050) @@ -287,28 +328,44 @@ #endif /* !__DARWIN_UNIX03 */ #if !defined(__DARWIN_64_BIT_INO_T) -# if defined(_DARWIN_USE_64_BIT_INODE) +# if defined(KERNEL) +# define __DARWIN_64_BIT_INO_T 0 +# elif defined(_DARWIN_USE_64_BIT_INODE) +# if defined(_DARWIN_NO_64_BIT_INODE) +# error "Can't define both _DARWIN_USE_64_BIT_INODE and _DARWIN_NO_64_BIT_INODE." +# endif /* _DARWIN_NO_64_BIT_INODE */ # define __DARWIN_64_BIT_INO_T 1 -# elif defined(_DARWIN_NO_64_BIT_INODE) || defined(KERNEL) +# elif defined(_DARWIN_NO_64_BIT_INODE) +# if __DARWIN_ONLY_64_BIT_INO_T +# error "Can't define _DARWIN_NO_64_BIT_INODE when only 64-bit inodes are available." 
+# endif /* __DARWIN_ONLY_64_BIT_INO_T */ # define __DARWIN_64_BIT_INO_T 0 # else /* default */ -# define __DARWIN_64_BIT_INO_T 0 +# if __DARWIN_ONLY_64_BIT_INO_T +# define __DARWIN_64_BIT_INO_T 1 +# else /* !__DARWIN_ONLY_64_BIT_INO_T */ +# define __DARWIN_64_BIT_INO_T 0 +# endif /* __DARWIN_ONLY_64_BIT_INO_T */ # endif #endif /* !__DARWIN_64_BIT_INO_T */ -#if !defined(__DARWIN_NON_CANCELABLE) +#if !defined(__DARWIN_VERS_1050) # if defined(KERNEL) -# define __DARWIN_NON_CANCELABLE 0 +# define __DARWIN_VERS_1050 0 +# elif __DARWIN_ONLY_VERS_1050 +# define __DARWIN_VERS_1050 1 +# elif defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && ((__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__-0) >= 1050) +# define __DARWIN_VERS_1050 1 # else /* default */ -# define __DARWIN_NON_CANCELABLE 0 +# define __DARWIN_VERS_1050 0 # endif -#endif /* !__DARWIN_NON_CANCELABLE */ +#endif /* !__DARWIN_VERS_1050 */ -#if !defined(__DARWIN_VERS_1050) -# if !defined(KERNEL) && defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && ((__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__-0) >= 1050) -# define __DARWIN_VERS_1050 1 +#if !defined(__DARWIN_NON_CANCELABLE) +# if defined(KERNEL) +# define __DARWIN_NON_CANCELABLE 0 # else /* default */ -# define __DARWIN_VERS_1050 0 +# define __DARWIN_NON_CANCELABLE 0 # endif #endif /* !__DARWIN_NON_CANCELABLE */ @@ -316,19 +373,31 @@ * symbol suffixes used for symbol versioning */ #if __DARWIN_UNIX03 -# if !defined(__LP64__) && !defined(__arm__) -# define __DARWIN_SUF_UNIX03 "$UNIX2003" -# define __DARWIN_SUF_UNIX03_SET 1 -# else /* __LP64__ || __arm__ */ +# if __DARWIN_ONLY_UNIX_CONFORMANCE # define __DARWIN_SUF_UNIX03 /* nothing */ -# define __DARWIN_SUF_UNIX03_SET 0 -# endif /* !__LP64__ && !__arm__ */ +# else /* !__DARWIN_ONLY_UNIX_CONFORMANCE */ +# define __DARWIN_SUF_UNIX03 "$UNIX2003" +# endif /* __DARWIN_ONLY_UNIX_CONFORMANCE */ # if __DARWIN_64_BIT_INO_T -# define __DARWIN_SUF_64_BIT_INO_T "$INODE64" +# if __DARWIN_ONLY_64_BIT_INO_T +# 
define __DARWIN_SUF_64_BIT_INO_T /* nothing */ +# else /* !__DARWIN_ONLY_64_BIT_INO_T */ +# define __DARWIN_SUF_64_BIT_INO_T "$INODE64" +# endif /* __DARWIN_ONLY_64_BIT_INO_T */ # else /* !__DARWIN_64_BIT_INO_T */ # define __DARWIN_SUF_64_BIT_INO_T /* nothing */ -# endif /* __DARWIN_UNIX03 */ +# endif /* __DARWIN_64_BIT_INO_T */ + +# if __DARWIN_VERS_1050 +# if __DARWIN_ONLY_VERS_1050 +# define __DARWIN_SUF_1050 /* nothing */ +# else /* !__DARWIN_ONLY_VERS_1050 */ +# define __DARWIN_SUF_1050 "$1050" +# endif /* __DARWIN_ONLY_VERS_1050 */ +# else /* !__DARWIN_VERS_1050 */ +# define __DARWIN_SUF_1050 /* nothing */ +# endif /* __DARWIN_VERS_1050 */ # if __DARWIN_NON_CANCELABLE # define __DARWIN_SUF_NON_CANCELABLE "$NOCANCEL" @@ -336,15 +405,8 @@ # define __DARWIN_SUF_NON_CANCELABLE /* nothing */ # endif /* __DARWIN_NON_CANCELABLE */ -# if __DARWIN_VERS_1050 -# define __DARWIN_SUF_1050 "$1050" -# else /* !__DARWIN_VERS_1050 */ -# define __DARWIN_SUF_1050 /* nothing */ -# endif /* __DARWIN_VERS_1050 */ - #else /* !__DARWIN_UNIX03 */ # define __DARWIN_SUF_UNIX03 /* nothing */ -# define __DARWIN_SUF_UNIX03_SET 0 # define __DARWIN_SUF_64_BIT_INO_T /* nothing */ # define __DARWIN_SUF_NON_CANCELABLE /* nothing */ # define __DARWIN_SUF_1050 /* nothing */ @@ -435,7 +497,7 @@ * long doubles. This applies only to ppc; i386 already has long double * support, while ppc64 doesn't have any backwards history. 
*/ -#if defined(__ppc__) +#if defined(__ppc__) # if defined(__LDBL_MANT_DIG__) && defined(__DBL_MANT_DIG__) && \ __LDBL_MANT_DIG__ > __DBL_MANT_DIG__ # if __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__-0 < 1040 @@ -450,7 +512,7 @@ # define __DARWIN_LDBL_COMPAT2(x) /* nothing */ # define __DARWIN_LONG_DOUBLE_IS_DOUBLE 1 # endif -#elif defined(__i386__) || defined(__ppc64__) || defined(__x86_64__) || defined (__arm__) +#elif defined(__i386__) || defined(__ppc64__) || defined(__x86_64__) # define __DARWIN_LDBL_COMPAT(x) /* nothing */ # define __DARWIN_LDBL_COMPAT2(x) /* nothing */ # define __DARWIN_LONG_DOUBLE_IS_DOUBLE 0 @@ -471,28 +533,54 @@ * Public darwin-specific feature macros *****************************************/ +/* + * _DARWIN_FEATURE_64_BIT_INODE indicates that the ino_t type is 64-bit, and + * structures modified for 64-bit inodes (like struct stat) will be used. + */ +#if __DARWIN_64_BIT_INO_T +#define _DARWIN_FEATURE_64_BIT_INODE 1 +#endif + /* * _DARWIN_FEATURE_LONG_DOUBLE_IS_DOUBLE indicates when the long double type - * is the same as the double type (ppc only) + * is the same as the double type (ppc and arm only) */ #if __DARWIN_LONG_DOUBLE_IS_DOUBLE #define _DARWIN_FEATURE_LONG_DOUBLE_IS_DOUBLE 1 #endif /* - * _DARWIN_FEATURE_UNIX_CONFORMANCE indicates whether UNIX conformance is on, - * and specifies the conformance level (3 is SUSv3) + * _DARWIN_FEATURE_ONLY_64_BIT_INODE indicates that the ino_t type may only + * be 64-bit; there is no support for 32-bit ino_t when this macro is defined + * (and non-zero). There is no struct stat64 either, as the regular + * struct stat will already be the 64-bit version. */ -#if __DARWIN_UNIX03 -#define _DARWIN_FEATURE_UNIX_CONFORMANCE 3 +#if __DARWIN_ONLY_64_BIT_INO_T +#define _DARWIN_FEATURE_ONLY_64_BIT_INODE 1 #endif /* - * _DARWIN_FEATURE_64_BIT_INODE indicates that the ino_t type is 64-bit, and - * structures modified for 64-bit inodes (like struct stat) will be used.
+ * _DARWIN_FEATURE_ONLY_VERS_1050 indicates that only those APIs updated + * in 10.5 exists; no pre-10.5 variants are available. */ -#if __DARWIN_64_BIT_INO_T -#define _DARWIN_FEATURE_64_BIT_INODE 1 +#if __DARWIN_ONLY_VERS_1050 +#define _DARWIN_FEATURE_ONLY_VERS_1050 1 +#endif + +/* + * _DARWIN_FEATURE_ONLY_UNIX_CONFORMANCE indicates only UNIX conforming API + * are available (the legacy BSD APIs are not available) + */ +#if __DARWIN_ONLY_UNIX_CONFORMANCE +#define _DARWIN_FEATURE_ONLY_UNIX_CONFORMANCE 1 +#endif + +/* + * _DARWIN_FEATURE_UNIX_CONFORMANCE indicates whether UNIX conformance is on, + * and specifies the conformance level (3 is SUSv3) + */ +#if __DARWIN_UNIX03 +#define _DARWIN_FEATURE_UNIX_CONFORMANCE 3 #endif #endif /* !_CDEFS_H_ */ diff --git a/bsd/sys/disk.h b/bsd/sys/disk.h index b36277729..879dcb22b 100644 --- a/bsd/sys/disk.h +++ b/bsd/sys/disk.h @@ -50,6 +50,8 @@ * DKIOCISFORMATTED is media formatted? * DKIOCISWRITABLE is media writable? * + * DKIOCDISCARD delete unused data + * * DKIOCGETMAXBLOCKCOUNTREAD get maximum block count for reads * DKIOCGETMAXBLOCKCOUNTWRITE get maximum block count for writes * DKIOCGETMAXBYTECOUNTREAD get maximum byte count for reads @@ -62,11 +64,21 @@ * * DKIOCGETMINSEGMENTALIGNMENTBYTECOUNT get minimum segment alignment in bytes * DKIOCGETMAXSEGMENTADDRESSABLEBITCOUNT get maximum segment width in bits + * + * DKIOCGETPHYSICALBLOCKSIZE get device's block size */ typedef struct { - char path[128]; + uint64_t offset; + uint64_t length; + + uint8_t reserved0128[16]; /* reserved, clear to zero */ +} dk_discard_t; + +typedef struct +{ + char path[128]; } dk_firmware_path_t; typedef struct @@ -102,6 +114,8 @@ typedef struct #define DKIOCISFORMATTED _IOR('d', 23, uint32_t) #define DKIOCISWRITABLE _IOR('d', 29, uint32_t) +#define DKIOCDISCARD _IOW('d', 31, dk_discard_t) + #define DKIOCGETMAXBLOCKCOUNTREAD _IOR('d', 64, uint64_t) #define DKIOCGETMAXBLOCKCOUNTWRITE _IOR('d', 65, uint64_t) #define DKIOCGETMAXBYTECOUNTREAD 
_IOR('d', 70, uint64_t) @@ -115,7 +129,10 @@ typedef struct #define DKIOCGETMINSEGMENTALIGNMENTBYTECOUNT _IOR('d', 74, uint64_t) #define DKIOCGETMAXSEGMENTADDRESSABLEBITCOUNT _IOR('d', 75, uint64_t) +#define DKIOCGETPHYSICALBLOCKSIZE _IOR('d', 77, uint32_t) + #ifdef KERNEL +#define DK_FEATURE_DISCARD 0x00000010 #define DK_FEATURE_FORCE_UNIT_ACCESS 0x00000001 #define DKIOCGETBLOCKCOUNT32 _IOR('d', 25, uint32_t) #define DKIOCSETBLOCKSIZE _IOW('d', 24, uint32_t) diff --git a/bsd/sys/imgact.h b/bsd/sys/imgact.h index 1bed0a7c5..36fb8d7e9 100644 --- a/bsd/sys/imgact.h +++ b/bsd/sys/imgact.h @@ -116,6 +116,9 @@ struct image_params { #define IMGPF_NONE 0x00000000 /* No flags */ #define IMGPF_INTERPRET 0x00000001 /* Interpreter invoked */ #define IMGPF_POWERPC 0x00000002 /* ppc mode for x86 */ +#if CONFIG_EMBEDDED +#undef IMGPF_POWERPC +#endif #define IMGPF_WAS_64BIT 0x00000004 /* exec from a 64Bit binary */ #define IMGPF_IS_64BIT 0x00000008 /* exec to a 64Bit binary */ diff --git a/bsd/sys/kdebug.h b/bsd/sys/kdebug.h index b8f903856..a5251673f 100644 --- a/bsd/sys/kdebug.h +++ b/bsd/sys/kdebug.h @@ -191,7 +191,8 @@ __BEGIN_DECLS #define DBG_IOBLUETOOTH 46 /* Bluetooth */ #define DBG_IOFIREWIRE 47 /* FireWire */ #define DBG_IOINFINIBAND 48 /* Infiniband */ -#define DBG_IOCPUPM 49 /* CPU Power Management */ +#define DBG_IOCPUPM 49 /* CPU Power Management */ +#define DBG_IOGRAPHICS 50 /* Graphics */ /* Backwards compatibility */ #define DBG_IOPOINTING DBG_IOHID /* OBSOLETE: Use DBG_IOHID instead */ @@ -214,6 +215,7 @@ __BEGIN_DECLS #define DBG_DRVBLUETOOTH 15 /* Bluetooth */ #define DBG_DRVFIREWIRE 16 /* FireWire */ #define DBG_DRVINFINIBAND 17 /* Infiniband */ +#define DBG_DRVGRAPHICS 18 /* Graphics */ /* Backwards compatibility */ #define DBG_DRVPOINTING DBG_DRVHID /* OBSOLETE: Use DBG_DRVHID instead */ diff --git a/bsd/sys/mount.h b/bsd/sys/mount.h index 80f813839..9019d694b 100644 --- a/bsd/sys/mount.h +++ b/bsd/sys/mount.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 
Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,6 +79,7 @@ #include #include #include /* XXX needed for user builds */ +#include #else #include #endif @@ -112,8 +113,12 @@ typedef struct fsid { int32_t val[2]; } fsid_t; /* file system id type */ uint32_t f_reserved[8]; /* For future use */ \ } +#if !__DARWIN_ONLY_64_BIT_INO_T + struct statfs64 __DARWIN_STRUCT_STATFS64; +#endif /* !__DARWIN_ONLY_64_BIT_INO_T */ + #if __DARWIN_64_BIT_INO_T struct statfs __DARWIN_STRUCT_STATFS64; @@ -636,15 +641,23 @@ typedef struct fhandle fhandle_t; __BEGIN_DECLS int fhopen(const struct fhandle *, int); int fstatfs(int, struct statfs *) __DARWIN_INODE64(fstatfs); -int fstatfs64(int, struct statfs64 *); +#if !__DARWIN_ONLY_64_BIT_INO_T +int fstatfs64(int, struct statfs64 *) __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5,__MAC_10_6,__IPHONE_NA,__IPHONE_NA); +#endif /* !__DARWIN_ONLY_64_BIT_INO_T */ int getfh(const char *, fhandle_t *); int getfsstat(struct statfs *, int, int) __DARWIN_INODE64(getfsstat); -int getfsstat64(struct statfs64 *, int, int); +#if !__DARWIN_ONLY_64_BIT_INO_T +int getfsstat64(struct statfs64 *, int, int) __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5,__MAC_10_6,__IPHONE_NA,__IPHONE_NA); +#endif /* !__DARWIN_ONLY_64_BIT_INO_T */ int getmntinfo(struct statfs **, int) __DARWIN_INODE64(getmntinfo); -int getmntinfo64(struct statfs64 **, int); +#if !__DARWIN_ONLY_64_BIT_INO_T +int getmntinfo64(struct statfs64 **, int) __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5,__MAC_10_6,__IPHONE_NA,__IPHONE_NA); +#endif /* !__DARWIN_ONLY_64_BIT_INO_T */ int mount(const char *, const char *, int, void *); int statfs(const char *, struct statfs *) __DARWIN_INODE64(statfs); -int statfs64(const char *, struct statfs64 *); +#if !__DARWIN_ONLY_64_BIT_INO_T +int statfs64(const char *, struct statfs64 *) __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5,__MAC_10_6,__IPHONE_NA,__IPHONE_NA); +#endif /* 
!__DARWIN_ONLY_64_BIT_INO_T */ int unmount(const char *, int); int getvfsbyname(const char *, struct vfsconf *); __END_DECLS diff --git a/bsd/sys/mount_internal.h b/bsd/sys/mount_internal.h index 1ff74ca3e..eb5966172 100644 --- a/bsd/sys/mount_internal.h +++ b/bsd/sys/mount_internal.h @@ -123,6 +123,7 @@ struct mount { lck_rw_t mnt_rwlock; /* mutex readwrite lock */ lck_mtx_t mnt_renamelock; /* mutex that serializes renames that change shape of tree */ vnode_t mnt_devvp; /* the device mounted on for local file systems */ + uint32_t mnt_devbsdunit; /* the BSD unit number of the device */ int32_t mnt_crossref; /* refernces to cover lookups crossing into mp */ int32_t mnt_iterref; /* refernces to cover iterations; drained makes it -ve */ @@ -174,8 +175,6 @@ struct mount { */ pid_t mnt_dependent_pid; void *mnt_dependent_process; - - struct timeval last_normal_IO_timestamp; }; /* @@ -340,6 +339,15 @@ struct user_statfs { #endif }; +/* + * Throttled I/Os are affected only by normal I/Os happening on the same bsd device node. For example, disk1s3 and + * disk1s5 are the same device node, while disk1s3 and disk2 are not (although disk2 might be a mounted disk image file + * and the disk image file resides on a partition in disk1). The following constant defines the maximum number of + * different bsd device nodes the algorithm can consider; larger unit numbers are reduced modulo this maximum.
Since + * throttled I/O is usually useful in non-server environment only, a small number 16 is enough in most cases + */ +#define LOWPRI_MAX_NUM_DEV 16 + __BEGIN_DECLS extern int mount_generation; @@ -377,6 +385,11 @@ void mount_iterdrop(mount_t); void mount_iterdrain(mount_t); void mount_iterreset(mount_t); +/* throttled I/O api */ +int throttle_get_io_policy(struct uthread **ut); +extern void throttle_lowpri_io(boolean_t ok_to_sleep); +int throttle_io_will_be_throttled(int lowpri_window_msecs, size_t devbsdunit); + __END_DECLS #endif /* !_SYS_MOUNT_INTERNAL_H_ */ diff --git a/bsd/sys/proc.h b/bsd/sys/proc.h index 9d41f7ca7..6395fa2cf 100644 --- a/bsd/sys/proc.h +++ b/bsd/sys/proc.h @@ -288,7 +288,10 @@ extern int IS_64BIT_PROCESS(proc_t); extern int tsleep(void *chan, int pri, const char *wmesg, int timo); extern int msleep1(void *chan, lck_mtx_t *mtx, int pri, const char *wmesg, u_int64_t timo); -#endif + +extern int proc_pidversion(proc_t); +extern int proc_getcdhash(proc_t, unsigned char *); +#endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/bsd/sys/proc_internal.h b/bsd/sys/proc_internal.h index 08f5fe12c..280e812b1 100644 --- a/bsd/sys/proc_internal.h +++ b/bsd/sys/proc_internal.h @@ -81,7 +81,9 @@ __BEGIN_DECLS #include __END_DECLS +#if DEBUG #define __PROC_INTERNAL_DEBUG 1 +#endif /* * The short form for various locks that protect fields in the data structures. @@ -327,6 +329,7 @@ struct proc { struct timeval p_start; /* starting time */ void * p_rcall; int p_ractive; + int p_idversion; /* version of process identity */ #if DIAGNOSTIC unsigned int p_fdlock_pc[4]; unsigned int p_fdunlock_pc[4]; diff --git a/bsd/sys/stat.h b/bsd/sys/stat.h index f60c65821..ab921b8ae 100644 --- a/bsd/sys/stat.h +++ b/bsd/sys/stat.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -74,6 +74,8 @@ #include #ifdef KERNEL #include +#else /* !KERNEL */ +#include #endif /* KERNEL */ /* [XSI] The timespec structure may be defined as described in */ @@ -264,8 +266,12 @@ struct stat { #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if !__DARWIN_ONLY_64_BIT_INO_T + struct stat64 __DARWIN_STRUCT_STAT64; +#endif /* !__DARWIN_ONLY_64_BIT_INO_T */ + #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ @@ -537,15 +543,18 @@ int mkdirx_np(const char *, filesec_t); int mkfifox_np(const char *, filesec_t); int statx_np(const char *, struct stat *, filesec_t) __DARWIN_INODE64(statx_np); int umaskx_np(filesec_t); -/* The following are simillar to stat and friends except provide struct stat64 instead of struct stat */ -int fstatx64_np(int, struct stat64 *, filesec_t); -int lstatx64_np(const char *, struct stat64 *, filesec_t); -int statx64_np(const char *, struct stat64 *, filesec_t); -int fstat64(int, struct stat64 *); -int lstat64(const char *, struct stat64 *); -int stat64(const char *, struct stat64 *); + +#if !__DARWIN_ONLY_64_BIT_INO_T +/* The following deprecated routines are similar to stat and friends, except that they provide struct stat64 instead of struct stat */ +int fstatx64_np(int, struct stat64 *, filesec_t) __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5,__MAC_10_6,__IPHONE_NA,__IPHONE_NA); +int lstatx64_np(const char *, struct stat64 *, filesec_t) __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5,__MAC_10_6,__IPHONE_NA,__IPHONE_NA); +int statx64_np(const char *, struct stat64 *, filesec_t) __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5,__MAC_10_6,__IPHONE_NA,__IPHONE_NA); +int fstat64(int, struct stat64 *) __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5,__MAC_10_6,__IPHONE_NA,__IPHONE_NA); +int lstat64(const char *, struct stat64 *) __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5,__MAC_10_6,__IPHONE_NA,__IPHONE_NA); +int stat64(const char *, struct stat64 *)
__OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5,__MAC_10_6,__IPHONE_NA,__IPHONE_NA); +#endif /* !__DARWIN_ONLY_64_BIT_INO_T */ #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ __END_DECLS -#endif +#endif /* !KERNEL */ #endif /* !_SYS_STAT_H_ */ diff --git a/bsd/sys/ubc.h b/bsd/sys/ubc.h index d22dfcaab..739eae812 100644 --- a/bsd/sys/ubc.h +++ b/bsd/sys/ubc.h @@ -68,11 +68,7 @@ int ubc_pages_resident(vnode_t); /* code signing */ struct cs_blob; -int ubc_cs_blob_add(vnode_t, cpu_type_t, off_t, vm_address_t, vm_size_t); struct cs_blob *ubc_cs_blob_get(vnode_t, cpu_type_t, off_t); -struct cs_blob *ubc_get_cs_blobs(vnode_t); -int ubc_cs_getcdhash(vnode_t, off_t, unsigned char *); - /* cluster IO routines */ int advisory_read(vnode_t, off_t, off_t, int); diff --git a/bsd/sys/ubc_internal.h b/bsd/sys/ubc_internal.h index eb4d413b5..9ac742bc1 100644 --- a/bsd/sys/ubc_internal.h +++ b/bsd/sys/ubc_internal.h @@ -139,7 +139,6 @@ __private_extern__ void ubc_init(void); __private_extern__ int ubc_umount(mount_t mp); __private_extern__ void ubc_unmountall(void); __private_extern__ memory_object_t ubc_getpager(vnode_t); -__private_extern__ int ubc_map(vnode_t, int); __private_extern__ void ubc_destroy_named(vnode_t); /* internal only */ @@ -169,6 +168,14 @@ int ubc_getcdhash(vnode_t, off_t, unsigned char *); int UBCINFOEXISTS(vnode_t); +/* code signing */ +struct cs_blob; +int ubc_cs_blob_add(vnode_t, cpu_type_t, off_t, vm_address_t, vm_size_t); +struct cs_blob *ubc_get_cs_blobs(vnode_t); +int ubc_cs_getcdhash(vnode_t, off_t, unsigned char *); +kern_return_t ubc_cs_blob_allocate(vm_offset_t *, vm_size_t *); +void ubc_cs_blob_deallocate(vm_offset_t, vm_size_t); + __END_DECLS diff --git a/bsd/sys/user.h b/bsd/sys/user.h index 89dd1cd4f..4aeb5c885 100644 --- a/bsd/sys/user.h +++ b/bsd/sys/user.h @@ -172,7 +172,8 @@ struct uthread { u_int32_t dlil_incremented_read; lck_mtx_t *uu_mtx; - int uu_lowpri_window; + int uu_lowpri_window; + size_t uu_devbsdunit; // to identify which device 
throttled I/Os are sent to struct user_sigaltstack uu_sigstk; int uu_defer_reclaims; @@ -224,7 +225,6 @@ struct uthread { #endif #endif /* CONFIG_DTRACE */ void * uu_threadlist; - mount_t v_mount; }; typedef struct uthread * uthread_t; diff --git a/bsd/sys/vnode.h b/bsd/sys/vnode.h index 681ad05f0..7aef5e9e8 100644 --- a/bsd/sys/vnode.h +++ b/bsd/sys/vnode.h @@ -662,6 +662,9 @@ int vnode_iterate(struct mount *, int, int (*)(struct vnode *, void *), void *); #define VNODE_ITERATE_ALL 0x80 #define VNODE_ITERATE_ACTIVE 0x100 #define VNODE_ITERATE_INACTIVE 0x200 +#ifdef BSD_KERNEL_PRIVATE +#define VNODE_ALWAYS 0x400 +#endif /* BSD_KERNEL_PRIVATE */ /* * return values from callback diff --git a/bsd/sys/vnode_internal.h b/bsd/sys/vnode_internal.h index ae4be39a9..66e32d7c3 100644 --- a/bsd/sys/vnode_internal.h +++ b/bsd/sys/vnode_internal.h @@ -413,6 +413,9 @@ int vnode_ref_ext(vnode_t, int); void vnode_rele_ext(vnode_t, int, int); void vnode_rele_internal(vnode_t, int, int, int); int vnode_getwithref(vnode_t); +#ifdef BSD_KERNEL_PRIVATE +int vnode_getalways(vnode_t); +#endif /* BSD_KERNEL_PRIVATE */ int vnode_get_locked(vnode_t); int vnode_put_locked(vnode_t); diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index 3a34e1787..9b1a7af25 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -416,6 +416,17 @@ cluster_hard_throttle_on(vnode_t vp) if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <)) return(1); } + struct uthread *ut; + if (throttle_get_io_policy(&ut) == IOPOL_THROTTLE) { + size_t devbsdunit; + if (vp->v_mount != NULL) + devbsdunit = vp->v_mount->mnt_devbsdunit; + else + devbsdunit = LOWPRI_MAX_NUM_DEV - 1; + if (throttle_io_will_be_throttled(-1, devbsdunit)) { + return(1); + } + } return(0); } diff --git a/bsd/vfs/vfs_journal.c b/bsd/vfs/vfs_journal.c index bff33c625..eb070de33 100644 --- a/bsd/vfs/vfs_journal.c +++ b/bsd/vfs/vfs_journal.c @@ -1706,14 +1706,6 @@ journal_open(struct vnode *jvp, if (phys_blksz != 
(size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) { printf("jnl: %s: open: phys_blksz %lu does not match journal header size %d\n", jdev_name, phys_blksz, jnl->jhdr->jhdr_size); - - orig_blksz = phys_blksz; - phys_blksz = jnl->jhdr->jhdr_size; - if (VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context)) { - printf("jnl: %s: could not set block size to %lu bytes.\n", jdev_name, phys_blksz); - goto bad_journal; - } -// goto bad_journal; } if ( jnl->jhdr->start <= 0 diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index 482fb8c46..c5d91125d 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -3389,6 +3389,16 @@ new_vnode(vnode_t *vpp) log(LOG_EMERG, "%d desired, %d numvnodes, " "%d free, %d dead, %d rage\n", desiredvnodes, numvnodes, freevnodes, deadvnodes, ragevnodes); +#if CONFIG_EMBEDDED + /* + * Running out of vnodes tends to make a system unusable. On an + * embedded system, it's unlikely that the user can do anything + * about it (or would know what to do, if they could). So panic + * the system so it will automatically restart (and hopefully we + * can get a panic log that tells us why we ran out). 
+ */ + panic("vnode table is full\n"); +#endif *vpp = NULL; return (ENFILE); } @@ -3558,6 +3568,12 @@ vnode_getwithref(vnode_t vp) } +__private_extern__ int +vnode_getalways(vnode_t vp) +{ + return(vget_internal(vp, 0, VNODE_ALWAYS)); +} + int vnode_put(vnode_t vp) { @@ -3726,6 +3742,7 @@ vnode_getiocount(vnode_t vp, int vid, int vflags) { int nodead = vflags & VNODE_NODEAD; int nosusp = vflags & VNODE_NOSUSPEND; + int always = vflags & VNODE_ALWAYS; for (;;) { /* @@ -3754,6 +3771,8 @@ vnode_getiocount(vnode_t vp, int vid, int vflags) (vp->v_owner == current_thread())) { break; } + if (always != 0) + break; vnode_lock_convert(vp); if (vp->v_lflag & VL_TERMINATE) { diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c index 51b67f399..be9bfe17a 100644 --- a/bsd/vfs/vfs_syscalls.c +++ b/bsd/vfs/vfs_syscalls.c @@ -95,6 +95,7 @@ #include #include #include +#include #include #include #include @@ -418,6 +419,7 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused regi strncpy(mp->mnt_vfsstat.f_mntonname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN); mp->mnt_vnodecovered = vp; mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx)); + mp->mnt_devbsdunit = LOWPRI_MAX_NUM_DEV - 1; /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */ vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE); @@ -590,6 +592,11 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused regi goto out3; } #endif + if (device_vnode != NULL) { + VNOP_IOCTL(device_vnode, DKIOCGETBSDUNIT, (caddr_t)&mp->mnt_devbsdunit, 0, NULL); + mp->mnt_devbsdunit %= LOWPRI_MAX_NUM_DEV; + } + /* * Mount the filesystem. 
*/ @@ -1020,6 +1027,7 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) int needwakeup = 0; int forcedunmount = 0; int lflags = 0; + struct vnode *devvp = NULLVP; if (flags & MNT_FORCE) forcedunmount = 1; @@ -1115,10 +1123,14 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) OSAddAtomic(1, (SInt32 *)&vfs_nummntops); if ( mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) { - mp->mnt_devvp->v_specflags &= ~SI_MOUNTEDON; - VNOP_CLOSE(mp->mnt_devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE, + /* hold an io reference and drop the usecount before close */ + devvp = mp->mnt_devvp; + vnode_clearmountedon(devvp); + vnode_getalways(devvp); + vnode_rele(devvp); + VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE, ctx); - vnode_rele(mp->mnt_devvp); + vnode_put(devvp); } lck_rw_done(&mp->mnt_rwlock); mount_list_remove(mp); @@ -4691,6 +4703,7 @@ rename(__unused proc_t p, struct rename_args *uap, __unused register_t *retval) struct nameidata fromnd, tond; vfs_context_t ctx = vfs_context_current(); int error; + int do_retry; int mntrename; int need_event; const char *oname; @@ -4702,6 +4715,7 @@ rename(__unused proc_t p, struct rename_args *uap, __unused register_t *retval) fse_info from_finfo, to_finfo; holding_mntlock = 0; + do_retry = 0; retry: fvp = tvp = NULL; fdvp = tdvp = NULL; @@ -4816,8 +4830,17 @@ rename(__unused proc_t p, struct rename_args *uap, __unused register_t *retval) if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp, NULL, vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, - ctx)) != 0) + ctx)) != 0) { + /* + * We could encounter a race where after doing the namei, tvp stops + * being valid. If so, simply re-drive the rename call from the + * top. 
+ */ + if (error == ENOENT) { + do_retry = 1; + } goto auth_exit; + } } else { /* node staying in same directory, must be allowed to add new name */ if ((error = vnode_authorize(fdvp, NULL, @@ -4826,8 +4849,17 @@ rename(__unused proc_t p, struct rename_args *uap, __unused register_t *retval) } /* overwriting tvp */ if ((tvp != NULL) && !vnode_isdir(tvp) && - ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) + ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) { + /* + * We could encounter a race where after doing the namei, tvp stops + * being valid. If so, simply re-drive the rename call from the + * top. + */ + if (error == ENOENT) { + do_retry = 1; + } goto auth_exit; + } /* XXX more checks? */ @@ -5071,6 +5103,15 @@ rename(__unused proc_t p, struct rename_args *uap, __unused register_t *retval) holding_mntlock = 0; } if (error) { + /* + * We may encounter a race in the VNOP where the destination didn't + * exist when we did the namei, but it does by the time we go and + * try to create the entry. In this case, we should re-drive this rename + * call from the top again. 
+ */ + if (error == EEXIST) { + do_retry = 1; + } goto out1; } @@ -5158,14 +5199,18 @@ rename(__unused proc_t p, struct rename_args *uap, __unused register_t *retval) vnode_update_identity(fvp, tdvp, tond.ni_cnd.cn_nameptr, tond.ni_cnd.cn_namelen, tond.ni_cnd.cn_hash, update_flags); } out1: - if (to_name != NULL) - RELEASE_PATH(to_name); - if (from_name != NULL) - RELEASE_PATH(from_name); - + if (to_name != NULL) { + RELEASE_PATH(to_name); + to_name = NULL; + } + if (from_name != NULL) { + RELEASE_PATH(from_name); + from_name = NULL; + } if (holding_mntlock) { mount_unlock_renames(locked_mp); mount_drop(locked_mp, 0); + holding_mntlock = 0; } if (tdvp) { /* @@ -5189,6 +5234,16 @@ rename(__unused proc_t p, struct rename_args *uap, __unused register_t *retval) vnode_put(fvp); vnode_put(fdvp); } + + /* + * If things changed after we did the namei, then we will re-drive + * this rename call from the top. + */ + if(do_retry) { + do_retry = 0; + goto retry; + } + return (error); } diff --git a/bsd/vfs/vfs_vnops.c b/bsd/vfs/vfs_vnops.c index 4dafffdf3..a8fc43f7c 100644 --- a/bsd/vfs/vfs_vnops.c +++ b/bsd/vfs/vfs_vnops.c @@ -408,7 +408,7 @@ vn_open_auth(struct nameidata *ndp, int *fmodep, struct vnode_attr *vap) goto bad; } if ( (error = vnode_ref_ext(vp, fmode)) ) { - goto bad; + goto bad2; } /* call out to allow 3rd party notification of open. 
@@ -419,6 +419,8 @@ vn_open_auth(struct nameidata *ndp, int *fmodep, struct vnode_attr *vap) *fmodep = fmode; return (0); +bad2: + VNOP_CLOSE(vp, fmode, ctx); bad: ndp->ni_vp = NULL; if (vp) { @@ -493,9 +495,16 @@ vn_close(struct vnode *vp, int flags, vfs_context_t ctx) } } #endif + + /* work around for foxhound */ + if (vp->v_type == VBLK) + (void)vnode_rele_ext(vp, flags, 0); + error = VNOP_CLOSE(vp, flags, ctx); - (void)vnode_rele_ext(vp, flags, 0); + if (vp->v_type != VBLK) + (void)vnode_rele_ext(vp, flags, 0); + return (error); } diff --git a/bsd/vm/vm_unix.c b/bsd/vm/vm_unix.c index 094b6258c..559f83290 100644 --- a/bsd/vm/vm_unix.c +++ b/bsd/vm/vm_unix.c @@ -1036,12 +1036,6 @@ shared_region_map_np( goto done; } - /* - * The mapping was successful. Let the buffer cache know - * that we've mapped that file with these protections. This - * prevents the vnode from getting recycled while it's mapped. - */ - (void) ubc_map(vp, VM_PROT_READ); error = 0; /* update the vnode's access time */ diff --git a/bsd/vm/vnode_pager.c b/bsd/vm/vnode_pager.c index c35f33369..483bda117 100644 --- a/bsd/vm/vnode_pager.c +++ b/bsd/vm/vnode_pager.c @@ -343,8 +343,6 @@ vnode_pageout(struct vnode *vp, } -extern void throttle_lowpri_io(int *lowpri_window,mount_t v_mount); - pager_return_t vnode_pagein( struct vnode *vp, @@ -512,15 +510,15 @@ vnode_pagein( ut = get_bsdthread_info(current_thread()); - if (ut->uu_lowpri_window && ut->v_mount) { + if (ut->uu_lowpri_window) { /* * task is marked as a low priority I/O type - * and the I/O we issued while in this system call + * and the I/O we issued while in this page fault * collided with normal I/O operations... 
we'll * delay in order to mitigate the impact of this * task on the normal operation of the system */ - throttle_lowpri_io(&ut->uu_lowpri_window,ut->v_mount); + throttle_lowpri_io(TRUE); } return (error); } diff --git a/config/MasterVersion b/config/MasterVersion index 64db93b87..38648f07c 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -9.5.0 +9.6.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. diff --git a/config/System6.0.i386.exports b/config/System6.0.i386.exports index 10c23908d..e558b7286 100644 --- a/config/System6.0.i386.exports +++ b/config/System6.0.i386.exports @@ -1,4 +1,3 @@ -_Cstate_table_set _PE_install_interrupt_handler _PE_interrupt_handler _acpi_install_wake_handler @@ -18,11 +17,8 @@ _mp_cpus_call _mp_rendezvous_no_intrs _mtrr_range_add _mtrr_range_remove -_pmsCPUSetPStateLimit -_pmsCPULoadVIDTable _rtc_clock_stepped _rtc_clock_stepping _smp_initialized -_thread_bind __ZN24IOBufferMemoryDescriptor20initWithPhysicalMaskEP4taskmyyy __ZN24IOBufferMemoryDescriptor22inTaskWithPhysicalMaskEP4taskmyy diff --git a/config/Unsupported.exports b/config/Unsupported.exports index ae2f63239..8aab26874 100644 --- a/config/Unsupported.exports +++ b/config/Unsupported.exports @@ -233,6 +233,7 @@ _sbappendrecord _sbflush _sbspace _securelevel +_sha1_hardware_hook _sleep _soabort _sobind diff --git a/config/Unsupported.i386.exports b/config/Unsupported.i386.exports index 1664fbf31..7720a4d41 100644 --- a/config/Unsupported.i386.exports +++ b/config/Unsupported.i386.exports @@ -1,9 +1,6 @@ _cpu_number _dsmos_page_transform_hook _gPEEFISystemTable -_hpet_get_info -_hpet_register_callback -_hpet_request _in6addr_local _io_map_spec _kdp_register_callout @@ -13,7 +10,6 @@ _m_mtod _ml_get_apicid _ml_get_maxbusdelay _ml_get_maxsnoop -_ml_hpet_cfg _ml_cpu_int_event_time _mp_rendezvous _mp_rendezvous_no_intrs @@ -21,7 +17,6 @@ 
_nd6_storelladdr _pmCPUControl _pmKextRegister _pm_init_lock -_rdHPET _real_ncpus _rtc_clock_napped _serial_getc diff --git a/iokit/IOKit/IOKitKeysPrivate.h b/iokit/IOKit/IOKitKeysPrivate.h index b9a6f5b6c..489b2f58d 100644 --- a/iokit/IOKit/IOKitKeysPrivate.h +++ b/iokit/IOKit/IOKitKeysPrivate.h @@ -70,4 +70,11 @@ enum { kIOPrepareToPhys32 = 0x04 }; #define kIOPlatformQuiesceActionKey "IOPlatformQuiesceAction" /* value is OSNumber (priority) */ #define kIOPlatformActiveActionKey "IOPlatformActiveAction" /* value is OSNumber (priority) */ +#define kIOPlatformFunctionHandlerSet "IOPlatformFunctionHandlerSet" +#if defined(__i386__) +#define kIOPlatformFunctionHandlerMaxBusDelay "IOPlatformFunctionHandlerMaxBusDelay" +#define kIOPlatformFunctionHandlerMaxInterruptDelay "IOPlatformFunctionHandlerMaxInterruptDelay" +#endif /* defined(__i386__) */ + + #endif /* ! _IOKIT_IOKITKEYSPRIVATE_H */ diff --git a/iokit/IOKit/platform/Makefile b/iokit/IOKit/platform/Makefile index 17669c894..405a7c3eb 100644 --- a/iokit/IOKit/platform/Makefile +++ b/iokit/IOKit/platform/Makefile @@ -12,12 +12,7 @@ include $(MakeInc_def) MI_DIR = platform NOT_EXPORT_HEADERS = -NOT_KF_MI_HEADERS = AppleARMCPU.h AppleARMFunction.h AppleARMIICController.h \ - AppleARMIICDevice.h AppleARMIISController.h \ - AppleARMIISDevice.h AppleARMIO.h AppleARMIODevice.h \ - AppleARMNORFlashController.h AppleARMNORFlashDevice.h \ - AppleARMPE.h AppleARMRTC.h AppleARMSPIController.h \ - AppleARMSPIDevice.h +NOT_KF_MI_HEADERS = INSTINC_SUBDIRS = INSTINC_SUBDIRS_PPC = diff --git a/iokit/IOKit/pwr_mgt/IOPM.h b/iokit/IOKit/pwr_mgt/IOPM.h index 1d3953308..aa6a0a615 100644 --- a/iokit/IOKit/pwr_mgt/IOPM.h +++ b/iokit/IOKit/pwr_mgt/IOPM.h @@ -361,6 +361,7 @@ enum { #define kIOPMPSHealthConfidenceKey "HealthConfidence" #define kIOPMPSCapacityEstimatedKey "CapacityEstimated" #define kIOPMPSBatteryChargeStatusKey "ChargeStatus" +#define kIOPMPSBatteryTemperatureKey "Temperature" // kIOPMBatteryChargeStatusKey may have one of 
the following values, or may have // no value. If kIOPMBatteryChargeStatusKey has a NULL value (or no value) associated with it @@ -484,6 +485,7 @@ enum { #define kIOPMSettingDisplaySleepUsesDimKey "Display Sleep Uses Dim" #define kIOPMSettingTimeZoneOffsetKey "TimeZoneOffsetSeconds" #define kIOPMSettingMobileMotionModuleKey "MobileMotionModule" +#define kIOPMSettingGraphicsSwitchKey "GPUSwitch" // Setting controlling drivers can register to receive scheduled wake data // Either in "CF seconds" type, or structured calendar data in a formatted diff --git a/iokit/IOKit/pwr_mgt/IOPMPrivate.h b/iokit/IOKit/pwr_mgt/IOPMPrivate.h index 8161a3b99..e32611c99 100644 --- a/iokit/IOKit/pwr_mgt/IOPMPrivate.h +++ b/iokit/IOKit/pwr_mgt/IOPMPrivate.h @@ -41,5 +41,42 @@ enum { kIOPMSetACAdaptorConnected = (1<<18) }; +/* + * PM notification types + */ + +/* @constant kIOPMStateConsoleUserShutdown + * @abstract Notification of GUI shutdown state available to kexts. + * @discussion This type can be passed as arguments to registerPMSettingController() + * to receive callbacks. + */ +#define kIOPMStateConsoleShutdown "ConsoleShutdown" + +/* @enum ShutdownValues + * @abstract Potential values shared with key kIOPMStateConsoleUserShutdown + */ +enum { +/* @constant kIOPMStateConsoleShutdownNone + * @abstract System shutdown (or restart) hasn't started; system is ON. + * @discussion Next state: 2 + */ + kIOPMStateConsoleShutdownNone = 1, +/* @constant kIOPMStateConsoleShutdownPossible + * @abstract User has been presented with the option to shutdown or restart. Shutdown may be cancelled. + * @discussion Next state may be: 1, 4 + */ + kIOPMStateConsoleShutdownPossible = 2, +/* @constant kIOPMStateConsoleShutdownUnderway + * @abstract Shutdown or restart is proceeding. It may still be cancelled. + * @discussion Next state may be: 1, 4. This state is currently unused. 
+ */ + kIOPMStateConsoleShutdownUnderway = 3, +/* @constant kIOPMStateConsoleShutdownCertain + * @abstract Shutdown is in progress and irrevocable. + * @discussion State remains 4 until power is removed from CPU. + */ + kIOPMStateConsoleShutdownCertain = 4 +}; + #endif /* ! _IOKIT_IOPMPRIVATE_H */ diff --git a/iokit/IOKit/pwr_mgt/IOPMlog.h b/iokit/IOKit/pwr_mgt/IOPMlog.h index 187638627..695c267b1 100644 --- a/iokit/IOKit/pwr_mgt/IOPMlog.h +++ b/iokit/IOKit/pwr_mgt/IOPMlog.h @@ -78,6 +78,7 @@ enum PMLogEnum { kPMLogSetClockGating, // 50 0x051000c8 - platform device specific clock control kPMLogSetPowerGating, // 51 0x051000cc - platform device specific power control kPMLogSetPinGroup, // 52 0x051000d0 - platform device specific gpio control + kPMLogIdleCancel, // 53 0x051000d4 - device unidle during change kIOPMlogLastEvent }; diff --git a/iokit/IOKit/pwr_mgt/Makefile b/iokit/IOKit/pwr_mgt/Makefile index dd2bff563..b1b7a39b0 100644 --- a/iokit/IOKit/pwr_mgt/Makefile +++ b/iokit/IOKit/pwr_mgt/Makefile @@ -15,9 +15,8 @@ NOT_EXPORT_HEADERS = \ IOPMinformee.h \ IOPMinformeeList.h \ IOPMlog.h \ - IOPMPagingPlexus.h \ - IOPMPrivate.h - + IOPMPagingPlexus.h + INSTINC_SUBDIRS = INSTINC_SUBDIRS_PPC = INSTINC_SUBDIRS_I386 = @@ -31,7 +30,7 @@ EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) INSTALL_MI_LIST = IOPMLibDefs.h IOPM.h IOPMDeprecated.h -INSTALL_MI_LCL_LIST = "" +INSTALL_MI_LCL_LIST = IOPMPrivate.h INSTALL_MI_DIR = $(MI_DIR) EXPORT_MI_LIST = $(filter-out $(NOT_EXPORT_HEADERS), $(ALL_HEADERS)) diff --git a/iokit/Kernel/IOCPU.cpp b/iokit/Kernel/IOCPU.cpp index 144b4c836..9ce7974ed 100644 --- a/iokit/Kernel/IOCPU.cpp +++ b/iokit/Kernel/IOCPU.cpp @@ -270,6 +270,7 @@ void PE_cpu_machine_quiesce(cpu_id_t target) if (targetCPU) targetCPU->quiesceCPU(); } + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #define super IOService diff --git a/iokit/Kernel/IOCatalogue.cpp b/iokit/Kernel/IOCatalogue.cpp index 
939c78a3c..0b5f54a25 100644 --- a/iokit/Kernel/IOCatalogue.cpp +++ b/iokit/Kernel/IOCatalogue.cpp @@ -1609,7 +1609,7 @@ kern_return_t IOCatalogue::removeKernelLinker(void) { goto finish; } - PE_parse_boot_arg("keepsyms", &keepsyms); + PE_parse_boot_argn("keepsyms", &keepsyms, sizeof (keepsyms)); IOLog("Jettisoning kernel linker.\n"); diff --git a/iokit/Kernel/IOMemoryCursor.cpp b/iokit/Kernel/IOMemoryCursor.cpp index 2487209f8..503fb3a64 100644 --- a/iokit/Kernel/IOMemoryCursor.cpp +++ b/iokit/Kernel/IOMemoryCursor.cpp @@ -72,7 +72,7 @@ IOMemoryCursor::initWithSpecification(SegmentFunction inSegFunc, static UInt sMaxDBDMASegment; if (!sMaxDBDMASegment) { sMaxDBDMASegment = (UInt) -1; - if (PE_parse_boot_arg("mseg", &sMaxDBDMASegment)) + if (PE_parse_boot_argn("mseg", &sMaxDBDMASegment, sizeof (sMaxDBDMASegment))) IOLog("Setting MaxDBDMASegment to %d\n", sMaxDBDMASegment); } diff --git a/iokit/Kernel/IONVRAM.cpp b/iokit/Kernel/IONVRAM.cpp index b4e86780d..499aa6cc8 100644 --- a/iokit/Kernel/IONVRAM.cpp +++ b/iokit/Kernel/IONVRAM.cpp @@ -31,6 +31,8 @@ #include #include #include +#include +#include #define super IOService @@ -926,6 +928,9 @@ OFVariable gOFVariables[] = { {"security-password", kOFVariableTypeData, kOFVariablePermRootOnly, -1}, {"boot-image", kOFVariableTypeData, kOFVariablePermUserWrite, -1}, {"com.apple.System.fp-state", kOFVariableTypeData, kOFVariablePermKernelOnly, -1}, +#if CONFIG_EMBEDDED + {"backlight-level", kOFVariableTypeData, kOFVariablePermUserWrite, -1}, +#endif {0, kOFVariableTypeData, kOFVariablePermUserRead, -1} }; diff --git a/iokit/Kernel/IOPMrootDomain.cpp b/iokit/Kernel/IOPMrootDomain.cpp index 81568ee1e..1ff71887b 100644 --- a/iokit/Kernel/IOPMrootDomain.cpp +++ b/iokit/Kernel/IOPMrootDomain.cpp @@ -329,7 +329,7 @@ static UInt32 computeDeltaTimeMS( const AbsoluteTime * startTime ) // expert informs us we are the root. 
// ********************************************************************************** -#define kRootDomainSettingsCount 14 +#define kRootDomainSettingsCount 16 static SYSCTL_STRUCT(_kern, OID_AUTO, sleeptime, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, @@ -363,7 +363,9 @@ bool IOPMrootDomain::start ( IOService * nub ) OSSymbol::withCString(kIOPMSettingWakeOnACChangeKey), OSSymbol::withCString(kIOPMSettingTimeZoneOffsetKey), OSSymbol::withCString(kIOPMSettingDisplaySleepUsesDimKey), - OSSymbol::withCString(kIOPMSettingMobileMotionModuleKey) + OSSymbol::withCString(kIOPMSettingMobileMotionModuleKey), + OSSymbol::withCString(kIOPMSettingGraphicsSwitchKey), + OSSymbol::withCString(kIOPMStateConsoleShutdown) }; @@ -2305,12 +2307,30 @@ void IOPMrootDomain::tellChangeUp ( unsigned long stateNum) { if ( stateNum == ON_STATE ) { -#if HIBERNATION // Direct callout into OSMetaClass so it can disable kmod unloads // during sleep/wake to prevent deadlocks. OSMetaClassSystemSleepOrWake( kIOMessageSystemHasPoweredOn ); - IOHibernateSystemPostWake(); + if (getPowerState() == ON_STATE) + { + // this is a quick wake from aborted sleep + if (idleSeconds && !wrangler) + { + AbsoluteTime deadline; + sleepASAP = false; + // stay awake for at least idleSeconds + clock_interval_to_deadline(idleSeconds, kSecondScale, &deadline); + thread_call_enter_delayed(extraSleepTimer, deadline); + // this gets turned off when we sleep again + idleSleepPending = true; + } + tellClients(kIOMessageSystemWillPowerOn); + } +#if HIBERNATION + else + { + IOHibernateSystemPostWake(); + } #endif return tellClients(kIOMessageSystemHasPoweredOn); } diff --git a/iokit/Kernel/IOPlatformExpert.cpp b/iokit/Kernel/IOPlatformExpert.cpp index 1b53461ec..a03ef1d90 100644 --- a/iokit/Kernel/IOPlatformExpert.cpp +++ b/iokit/Kernel/IOPlatformExpert.cpp @@ -107,7 +107,7 @@ bool IOPlatformExpert::start( IOService * provider ) return false; // Override the mapper present flag is requested by boot arguments. 
- if (PE_parse_boot_arg("dart", &debugFlags) && (debugFlags == 0)) + if (PE_parse_boot_argn("dart", &debugFlags, sizeof (debugFlags)) && (debugFlags == 0)) removeProperty(kIOPlatformMapperPresentKey); // Register the presence or lack thereof a system diff --git a/iokit/Kernel/IOService.cpp b/iokit/Kernel/IOService.cpp index f92941428..f58ea137c 100644 --- a/iokit/Kernel/IOService.cpp +++ b/iokit/Kernel/IOService.cpp @@ -153,6 +153,8 @@ const OSSymbol * gIOPlatformWakeActionKey; const OSSymbol * gIOPlatformQuiesceActionKey; const OSSymbol * gIOPlatformActiveActionKey; +const OSSymbol * gIOPlatformFunctionHandlerSet; + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #define LOCKREADNOTIFY() \ @@ -206,19 +208,34 @@ bool IOService::isInactive( void ) const /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ -#if __i386__ +#if defined(__i386__) // Only used by the intel implementation of -// IOService::requireMaxBusStall(UInt32 __unused ns) -struct BusStallEntry +// IOService::requireMaxBusStall(UInt32 ns) +// IOService::requireMaxInterruptDelay(uint32_t ns) +struct CpuDelayEntry { - const IOService *fService; - UInt32 fMaxDelay; + IOService * fService; + UInt32 fMaxDelay; + UInt32 fDelayType; +}; + +enum { + kCpuDelayBusStall, kCpuDelayInterrupt, + kCpuNumDelayTypes }; -static OSData *sBusStall = OSData::withCapacity(8 * sizeof(BusStallEntry)); -static IOLock *sBusStallLock = IOLockAlloc(); -#endif /* __i386__ */ +static OSData *sCpuDelayData = OSData::withCapacity(8 * sizeof(CpuDelayEntry)); +static IORecursiveLock *sCpuDelayLock = IORecursiveLockAlloc(); +static OSArray *sCpuLatencyHandlers[kCpuNumDelayTypes]; +const OSSymbol *sCPULatencyFunctionName[kCpuNumDelayTypes]; + +static void +requireMaxCpuDelay(IOService * service, UInt32 ns, UInt32 delayType); +static IOReturn +setLatencyHandler(UInt32 delayType, IOService * target, bool enable); + +#endif /* defined(__i386__) */ /* * * * * * * * * * * * * * * * * * * * * 
* * * * * * * * * * * * * * * */ @@ -288,6 +305,11 @@ void IOService::initialize( void ) gIOPlatformQuiesceActionKey = OSSymbol::withCStringNoCopy(kIOPlatformQuiesceActionKey); gIOPlatformActiveActionKey = OSSymbol::withCStringNoCopy(kIOPlatformActiveActionKey); + gIOPlatformFunctionHandlerSet = OSSymbol::withCStringNoCopy(kIOPlatformFunctionHandlerSet); +#if defined(__i386__) + sCPULatencyFunctionName[kCpuDelayBusStall] = OSSymbol::withCStringNoCopy(kIOPlatformFunctionHandlerMaxBusDelay); + sCPULatencyFunctionName[kCpuDelayInterrupt] = OSSymbol::withCStringNoCopy(kIOPlatformFunctionHandlerMaxInterruptDelay); +#endif gNotificationLock = IORecursiveLockAlloc(); assert( gIOServicePlane && gIODeviceMemoryKey @@ -822,9 +844,23 @@ IOReturn IOService::callPlatformFunction( const OSSymbol * functionName, void *param3, void *param4 ) { IOReturn result = kIOReturnUnsupported; - IOService *provider = getProvider(); - - if (provider != 0) { + IOService *provider; + + if (gIOPlatformFunctionHandlerSet == functionName) + { +#if defined(__i386__) + const OSSymbol * functionHandlerName = (const OSSymbol *) param1; + IOService * target = (IOService *) param2; + bool enable = (param3 != 0); + + if (sCPULatencyFunctionName[kCpuDelayBusStall] == functionHandlerName) + result = setLatencyHandler(kCpuDelayBusStall, target, enable); + else if (sCPULatencyFunctionName[kCpuDelayInterrupt] == param1) + result = setLatencyHandler(kCpuDelayInterrupt, target, enable); +#endif /* defined(__i386__) */ + } + + if ((kIOReturnUnsupported == result) && (provider = getProvider())) { result = provider->callPlatformFunction(functionName, waitForFunction, param1, param2, param3, param4); } @@ -4421,82 +4457,182 @@ void IOService::setDeviceMemory( OSArray * array ) void IOService:: setCPUSnoopDelay(UInt32 __unused ns) { -#if __i386__ +#if defined(__i386__) ml_set_maxsnoop(ns); -#endif /* __i386__ */ +#endif /* defined(__i386__) */ } UInt32 IOService:: getCPUSnoopDelay() { -#if __i386__ +#if 
defined(__i386__) return ml_get_maxsnoop(); #else return 0; -#endif /* __i386__ */ +#endif /* defined(__i386__) */ } -void IOService:: -requireMaxBusStall(UInt32 __unused ns) +#if defined(__i386__) +static void +requireMaxCpuDelay(IOService * service, UInt32 ns, UInt32 delayType) { -#if __i386__ static const UInt kNoReplace = -1U; // Must be an illegal index UInt replace = kNoReplace; + bool setCpuDelay = false; - IOLockLock(sBusStallLock); + IORecursiveLockLock(sCpuDelayLock); - UInt count = sBusStall->getLength() / sizeof(BusStallEntry); - BusStallEntry *entries = (BusStallEntry *) sBusStall->getBytesNoCopy(); + UInt count = sCpuDelayData->getLength() / sizeof(CpuDelayEntry); + CpuDelayEntry *entries = (CpuDelayEntry *) sCpuDelayData->getBytesNoCopy(); + IOService * holder = NULL; if (ns) { - const BusStallEntry ne = {this, ns}; - - // Set Maximum bus delay. - for (UInt i = 0; i < count; i++) { - const IOService *thisService = entries[i].fService; - if (this == thisService) - replace = i; - else if (!thisService) { - if (kNoReplace == replace) - replace = i; - } - else { - const UInt32 thisMax = entries[i].fMaxDelay; - if (thisMax < ns) - ns = thisMax; - } - } - - // Must be safe to call from locked context - ml_set_maxbusdelay(ns); - - if (kNoReplace == replace) - sBusStall->appendBytes(&ne, sizeof(ne)); - else - entries[replace] = ne; + const CpuDelayEntry ne = {service, ns, delayType}; + holder = service; + // Set maximum delay. 
+ for (UInt i = 0; i < count; i++) { + IOService *thisService = entries[i].fService; + bool sameType = (delayType == entries[i].fDelayType); + if ((service == thisService) && sameType) + replace = i; + else if (!thisService) { + if (kNoReplace == replace) + replace = i; + } + else if (sameType) { + const UInt32 thisMax = entries[i].fMaxDelay; + if (thisMax < ns) + { + ns = thisMax; + holder = thisService; + } + } + } + + setCpuDelay = true; + if (kNoReplace == replace) + sCpuDelayData->appendBytes(&ne, sizeof(ne)); + else + entries[replace] = ne; } else { - ns = -1U; // Set to max unsigned, i.e. no restriction - - for (UInt i = 0; i < count; i++) { - // Clear a maximum bus delay. - const IOService *thisService = entries[i].fService; - UInt32 thisMax = entries[i].fMaxDelay; - if (this == thisService) - replace = i; - else if (thisService && thisMax < ns) - ns = thisMax; + ns = -1U; // Set to max unsigned, i.e. no restriction + + for (UInt i = 0; i < count; i++) { + // Clear a maximum delay. 
+ IOService *thisService = entries[i].fService; + if (thisService && (delayType == entries[i].fDelayType)) { + UInt32 thisMax = entries[i].fMaxDelay; + if (service == thisService) + replace = i; + else if (thisMax < ns) { + ns = thisMax; + holder = thisService; + } + } + } + + // Check if entry found + if (kNoReplace != replace) { + entries[replace].fService = 0; // Null the entry + setCpuDelay = true; + } + } + + if (setCpuDelay) + { + // Must be safe to call from locked context + if (delayType == kCpuDelayBusStall) + { + ml_set_maxbusdelay(ns); + } + else if (delayType == kCpuDelayInterrupt) + { + ml_set_maxintdelay(ns); + } + + OSArray * handlers = sCpuLatencyHandlers[delayType]; + IOService * target; + if (handlers) for (unsigned int idx = 0; + (target = (IOService *) handlers->getObject(idx)); + idx++) + { + target->callPlatformFunction(sCPULatencyFunctionName[delayType], false, + (void *) (uintptr_t) ns, holder, + NULL, NULL); } + } - // Check if entry found - if (kNoReplace != replace) { - entries[replace].fService = 0; // Null the entry - ml_set_maxbusdelay(ns); + IORecursiveLockUnlock(sCpuDelayLock); +} + +static IOReturn +setLatencyHandler(UInt32 delayType, IOService * target, bool enable) +{ + IOReturn result = kIOReturnNotFound; + OSArray * array; + unsigned int idx; + + IORecursiveLockLock(sCpuDelayLock); + + do + { + if (enable && !sCpuLatencyHandlers[delayType]) + sCpuLatencyHandlers[delayType] = OSArray::withCapacity(4); + array = sCpuLatencyHandlers[delayType]; + if (!array) + break; + idx = array->getNextIndexOfObject(target, 0); + if (!enable) + { + if (-1U != idx) + { + array->removeObject(idx); + result = kIOReturnSuccess; + } + } + else + { + if (-1U != idx) { + result = kIOReturnExclusiveAccess; + break; + } + array->setObject(target); + + UInt count = sCpuDelayData->getLength() / sizeof(CpuDelayEntry); + CpuDelayEntry *entries = (CpuDelayEntry *) sCpuDelayData->getBytesNoCopy(); + UInt32 ns = -1U; // Set to max unsigned, i.e. 
no restriction + IOService * holder = NULL; + + for (UInt i = 0; i < count; i++) { + if (entries[i].fService + && (delayType == entries[i].fDelayType) + && (entries[i].fMaxDelay < ns)) { + ns = entries[i].fMaxDelay; + holder = entries[i].fService; + } + } + target->callPlatformFunction(sCPULatencyFunctionName[delayType], false, + (void *) (uintptr_t) ns, holder, + NULL, NULL); + result = kIOReturnSuccess; } } + while (false); - IOLockUnlock(sBusStallLock); -#endif /* __i386__ */ + IORecursiveLockUnlock(sCpuDelayLock); + + return (result); +} + +#endif /* defined(__i386__) */ + +void IOService:: +requireMaxBusStall(UInt32 __unused ns) +{ +#if defined(__i386__) + requireMaxCpuDelay(this, ns, kCpuDelayBusStall); +#endif } /* diff --git a/iokit/Kernel/IOServicePM.cpp b/iokit/Kernel/IOServicePM.cpp index 1f9678fd5..5b5cc0b8a 100644 --- a/iokit/Kernel/IOServicePM.cpp +++ b/iokit/Kernel/IOServicePM.cpp @@ -117,6 +117,10 @@ do { \ #define NS_TO_MS(nsec) ((int)((nsec) / 1000000ULL)) +#if CONFIG_EMBEDDED +#define SUPPORT_IDLE_CANCEL 1 +#endif + //********************************************************************************* // PM machine states //********************************************************************************* @@ -1784,6 +1788,7 @@ IOReturn IOService::requestPowerDomainState ( unsigned long computedState; unsigned long theDesiredState; IOService * child; + IOPMRequest * childRequest; if (!initialized) return IOPMNotYetInitialized; @@ -1893,7 +1898,9 @@ IOReturn IOService::requestPowerDomainState ( } // Record the child's desires on the connection. 
- +#if SUPPORT_IDLE_CANCEL + bool attemptCancel = ((kIOPMPreventIdleSleep & desiredState) && !whichChild->getPreventIdleSleepFlag()); +#endif whichChild->setDesiredDomainState( computedState ); whichChild->setPreventIdleSleepFlag( desiredState & kIOPMPreventIdleSleep ); whichChild->setPreventSystemSleepFlag( desiredState & kIOPMPreventSystemSleep ); @@ -1907,8 +1914,6 @@ IOReturn IOService::requestPowerDomainState ( if (!fWillAdjustPowerState && !fDeviceOverrides) { - IOPMRequest * childRequest; - childRequest = acquirePMRequest( this, kIOPMRequestTypeAdjustPowerState ); if (childRequest) { @@ -1916,6 +1921,16 @@ IOReturn IOService::requestPowerDomainState ( fWillAdjustPowerState = true; } } +#if SUPPORT_IDLE_CANCEL + if (attemptCancel) + { + childRequest = acquirePMRequest( this, kIOPMRequestTypeIdleCancel ); + if (childRequest) + { + submitPMRequest( childRequest ); + } + } +#endif return IOPMNoErr; } @@ -3898,11 +3913,7 @@ bool IOService::ackTimerTick( void ) // apps didn't respond in time cleanClientResponses(true); OUR_PMLog(kPMLogClientTardy, 0, 1); - if (fMachineState == kIOPM_OurChangeTellClientsPowerDown) - { - // tardy equates to veto - fDoNotPowerDown = true; - } + // tardy equates to approval done = true; break; @@ -4855,6 +4866,11 @@ IOReturn IOService::cancelPowerChange ( unsigned long refcon ) return kIOReturnSuccess; } + OSString * name = IOCopyLogNameForPID(proc_selfpid()); + PM_ERROR("PM notification cancel (%s)\n", name ? name->getCStringNoCopy() : ""); + if (name) + name->release(); + request = acquirePMRequest( this, kIOPMRequestTypeCancelPowerChange ); if (!request) { @@ -5376,11 +5392,6 @@ bool IOService::servicePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) case kIOPM_OurChangeTellClientsPowerDown: // our change, was it vetoed? 
- if (fDesiredPowerState > fHeadNoteState) - { - PM_DEBUG("%s: idle cancel\n", fName); - fDoNotPowerDown = true; - } if (!fDoNotPowerDown) { // no, we can continue @@ -5388,6 +5399,8 @@ bool IOService::servicePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) } else { + OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, fMachineState); + PM_ERROR("%s: idle cancel\n", fName); // yes, rescind the warning tellNoChangeDown(fHeadNoteState); // mark the change note un-actioned @@ -5398,7 +5411,25 @@ bool IOService::servicePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) break; case kIOPM_OurChangeTellPriorityClientsPowerDown: - OurChangeTellPriorityClientsPowerDown(); + // our change, should it be acted on still? +#if SUPPORT_IDLE_CANCEL + if (fDoNotPowerDown) + { + OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, fMachineState); + PM_ERROR("%s: idle revert\n", fName); + // no, tell clients we're back in the old state + tellChangeUp(fCurrentPowerState); + // mark the change note un-actioned + fHeadNoteFlags |= IOPMNotDone; + // and we're done + all_done(); + } + else +#endif + { + // yes, we can continue + OurChangeTellPriorityClientsPowerDown(); + } break; case kIOPM_OurChangeNotifyInterestedDriversWillChange: @@ -5671,6 +5702,20 @@ bool IOService::servicePMReplyQueue( IOPMRequest * request, IOPMRequestQueue * q more = true; break; +#if SUPPORT_IDLE_CANCEL + case kIOPMRequestTypeIdleCancel: + if ((fMachineState == kIOPM_OurChangeTellClientsPowerDown) + || (fMachineState == kIOPM_OurChangeTellPriorityClientsPowerDown)) + { + OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, 0); + fDoNotPowerDown = true; + if (fMachineState == kIOPM_OurChangeTellPriorityClientsPowerDown) + cleanClientResponses(false); + more = true; + } + break; +#endif + default: IOPanic("servicePMReplyQueue: unknown reply type"); } diff --git a/iokit/Kernel/IOServicePMPrivate.h b/iokit/Kernel/IOServicePMPrivate.h index 658730e05..21e9361ff 100644 --- a/iokit/Kernel/IOServicePMPrivate.h +++ 
b/iokit/Kernel/IOServicePMPrivate.h @@ -462,7 +462,8 @@ enum { kIOPMRequestTypeAckSetPowerState = 0x82, kIOPMRequestTypeAllowPowerChange = 0x83, kIOPMRequestTypeCancelPowerChange = 0x84, - kIOPMRequestTypeInterestChanged = 0x85 + kIOPMRequestTypeInterestChanged = 0x85, + kIOPMRequestTypeIdleCancel = 0x86 }; //********************************************************************************* diff --git a/iokit/Kernel/IOStartIOKit.cpp b/iokit/Kernel/IOStartIOKit.cpp index 25c13cb0e..42003bf24 100644 --- a/iokit/Kernel/IOStartIOKit.cpp +++ b/iokit/Kernel/IOStartIOKit.cpp @@ -52,7 +52,6 @@ extern "C" { extern void OSlibkernInit (void); -extern void ml_hpet_cfg(uint32_t, uint32_t); #include #include @@ -100,7 +99,7 @@ void StartIOKit( void * p1, void * p2, void * p3, void * p4 ) OSCollectionIterator * kmodIter; // must release OSString * kmodName; // don't release - if( PE_parse_boot_arg( "io", &debugFlags )) + if( PE_parse_boot_argn( "io", &debugFlags, sizeof (debugFlags) )) gIOKitDebug = debugFlags; // Check for the log synchronous bit set in io diff --git a/kgmacros b/kgmacros index 8da2cc9b2..1fa767c15 100644 --- a/kgmacros +++ b/kgmacros @@ -1960,6 +1960,7 @@ define showuserstack _kgm_update_loop end else + if ($kgm_mtype == 7) set $newact = (struct thread *) $arg0 #This needs to identify 64-bit processes as well set $newiss = (x86_saved_state32_t) ($newact->machine.pcb->iss.uss.ss_32) @@ -1975,6 +1976,9 @@ define showuserstack _kgm_flush_loop _kgm_update_loop end + else + echo showuserstack not supported on this architecture\n + end end end document showuserstack @@ -2053,11 +2057,15 @@ define switchtocorethread flushstack set $pc = $newact->machine->pcb.save_srr0 else + if ($kgm_mtype == 7) set $kgm_cstatep = (struct x86_kernel_state32 *) \ ($newact->kernel_stack + 0x4000 \ - sizeof(struct x86_kernel_state32)) loadcontext $kgm_cstatep flushstack + else + echo switchtocorethread not supported on this architecture\n + end end showcontext_int end @@ -2116,6 +2124,7 
@@ define loadcontext set $cr = $kgm_contextp.save_cr set $ctr = $kgm_contextp.save_ctr else + if ($kgm_mtype == 7) set $kgm_contextp = (struct x86_kernel_state32 *) $arg0 set $ebx = $kgm_contextp->k_ebx set $ebp = $kgm_contextp->k_ebp @@ -2123,6 +2132,9 @@ define loadcontext set $esi = $kgm_contextp->k_esi set $eip = $kgm_contextp->k_eip set $pc = $kgm_contextp->k_eip + else + echo loadcontext not supported on this architecture\n + end end end @@ -2146,6 +2158,8 @@ define resetcorectx flushstack set $pc = $kdpstatep->eip update + else + echo resetcorectx not supported on this architecture\n end end showcontext_int @@ -5350,7 +5364,7 @@ define showMCAstate _if_present mca_threshold_status_present printf "\n%d error banks, ", mca_error_bank_count printf "family code 0x%x, ", mca_family - printf "machine-check exception taken: %d\n", mca_exception_taken + printf "machine-check dump state: %d\n", mca_dump_state set $kgm_cpu = 0 while cpu_data_ptr[$kgm_cpu] != 0 set $kgm_mcp = cpu_data_ptr[$kgm_cpu]->cpu_mca_state diff --git a/libkern/crypto/sha1.c b/libkern/crypto/sha1.c index 5e10e07b0..7924e6382 100644 --- a/libkern/crypto/sha1.c +++ b/libkern/crypto/sha1.c @@ -55,6 +55,7 @@ #include #include +#include #include #define memset(x, y, z) bzero(x, z); @@ -141,8 +142,17 @@ static unsigned char PADDING[64] = { 0x80, /* zeros */ }; static void SHA1Transform(u_int32_t, u_int32_t, u_int32_t, u_int32_t, u_int32_t, const u_int8_t *, SHA1_CTX *); +void _SHA1Update(SHA1_CTX *context, const void *inpp, size_t inputLen); + void SHA1Final_r(SHA1_CTX *, void *); +typedef kern_return_t (*InKernelPerformSHA1Func)(void *ref, const void *data, size_t dataLen, u_int32_t *inHash, u_int32_t options, u_int32_t *outHash, Boolean usePhysicalAddress); +void sha1_hardware_hook(Boolean option, InKernelPerformSHA1Func func, void *ref); +static void *SHA1Ref; +InKernelPerformSHA1Func performSHA1WithinKernelOnly; +#define SHA1_USE_HARDWARE_THRESHOLD 2048 //bytes + + /* * SHA1 initialization. 
Begins a SHA1 operation, writing a new context. */ @@ -166,7 +176,7 @@ SHA1Init(SHA1_CTX *context) * context. */ void -SHA1Update(SHA1_CTX *context, const void *inpp, size_t inputLen) +_SHA1Update(SHA1_CTX *context, const void *inpp, size_t inputLen) { u_int32_t i, index, partLen; const unsigned char *input = (const unsigned char *)inpp; @@ -210,6 +220,105 @@ SHA1Update(SHA1_CTX *context, const void *inpp, size_t inputLen) memcpy(&context->buffer[index], &input[i], inputLen - i); } + + + +/* + * This function is called by the SHA1 hardware kext during its init. + * This will register the function to call to perform SHA1 using hardware. + */ +void sha1_hardware_hook(Boolean option, InKernelPerformSHA1Func func, void *ref) +{ + if(option) { + // Establish the hook. The hardware is ready. + OSCompareAndSwap((uintptr_t)NULL, (uintptr_t)ref, (uintptr_t *)&SHA1Ref); + + if(!OSCompareAndSwap((uintptr_t)NULL, (uintptr_t)func, (uintptr_t *)&performSHA1WithinKernelOnly)) { + panic("sha1_hardware_hook: Called twice.. Should never happen\n"); + } + } + else { + // The hardware is going away. Tear down the hook. + performSHA1WithinKernelOnly = NULL; + SHA1Ref = NULL; + } +} + +static u_int32_t SHA1UpdateWithHardware(SHA1_CTX *context, const unsigned char *data, size_t dataLen, Boolean usePhysicalAddress) +{ + u_int32_t *inHashBuffer = context->state; + u_int32_t options = 0; + int result; + + result = performSHA1WithinKernelOnly(SHA1Ref, data, dataLen, inHashBuffer, options, inHashBuffer, usePhysicalAddress); + if(result != KERN_SUCCESS) { + //The hardware failed to hash for some reason. Fall back to software. + return 0; + } + + //Update the context with the total length. + /* Update number of bits */ + if ((context->bcount[1] += (dataLen << 3)) < (dataLen << 3)) + context->bcount[0]++; + context->bcount[0] += (dataLen >> 29); + return dataLen; +} + +/* + * This function is only called from the pagefault path or from page_copy().
+ * So we assume that we can safely convert the virtual address to the physical address and use it. + * Assumptions: The passed in address(inpp) is a kernel virtual address + * and a physical page has been faulted in. + * The inputLen passed in should always be less than or equal to a page size (4096) + * and inpp should be on a page boundary. + * "performSHA1WithinKernelOnly" is initialized only when the hardware driver exists and is ready. + */ +void SHA1UpdateUsePhysicalAddress(SHA1_CTX *context, const void *inpp, size_t inputLen) +{ + Boolean usePhysicalAddress = TRUE; + if((inputLen == PAGE_SIZE) && performSHA1WithinKernelOnly) { // If hardware exists and is ready. + if(SHA1UpdateWithHardware(context, (const unsigned char *)inpp, inputLen, usePhysicalAddress)) + return; + //else for some reason the hardware failed.. + //fall through to software and try the hash in software. + } + //Use the software implementation since the hardware is absent or + // has not been initialized yet or inputLen != PAGE_SIZE. + _SHA1Update(context, inpp, inputLen); +} + +/* + * A wrapper around _SHA1Update() to pick between software or hardware based SHA1. + * + */ +void SHA1Update(SHA1_CTX *context, const void *inpp, size_t inputLen) +{ + const unsigned char *input = (const unsigned char *)inpp; + Boolean usePhysicalAddress = FALSE; + u_int32_t index; + + if((inputLen > SHA1_USE_HARDWARE_THRESHOLD) && performSHA1WithinKernelOnly) { + index = (context->bcount[1] >> 3) & 0x3F; + if(index != 0) { //bytes left in the context. Handle them first. 
+ u_int32_t partLen = 64 - index; + memcpy(&context->buffer[index], input, partLen); + _SHA1Update(context, input, partLen); //hash only the partLen bytes that complete the buffered block; the remainder is hashed below + inputLen -= partLen; + input += partLen; + } + + u_int32_t lenForHardware = inputLen & (~0x3F); //multiple of 64 + u_int32_t bytesHashed = 0; + bytesHashed = SHA1UpdateWithHardware(context, input, lenForHardware, usePhysicalAddress); + + inputLen -= bytesHashed; + input += bytesHashed; + } + + //Fall through to the software implementation. + _SHA1Update(context, input, inputLen); +} + /* * For backwards compatibility, sha1_result symbol is mapped to this * routine since it's equivalent to SHA1Final with reversed parameters. diff --git a/libkern/libkern/c++/OSMetaClass.h b/libkern/libkern/c++/OSMetaClass.h index fd4bc66b6..ef527fdd7 100644 --- a/libkern/libkern/c++/OSMetaClass.h +++ b/libkern/libkern/c++/OSMetaClass.h @@ -48,6 +48,14 @@ class OSSerialize; #define APPLE_KEXT_VTABLE_PADDING 1 +#if defined(__LP64__) +#define APPLE_KEXT_LEGACY_ABI 0 +#elif defined(__arm__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) +#define APPLE_KEXT_LEGACY_ABI 0 +#else +#define APPLE_KEXT_LEGACY_ABI 1 +#endif + #if APPLE_KEXT_VTABLE_PADDING #define APPLE_KEXT_PAD_METHOD virtual #define APPLE_KEXT_PAD_IMPL(index) gMetaClass.reservedCalled(index) @@ -100,13 +108,14 @@ class OSMetaClassBase #define OSCheckTypeInst(typeinst, inst) \ OSMetaClassBase::checkTypeInst(inst, typeinst) +typedef void (*_ptf_t)(void); + +#if APPLE_KEXT_LEGACY_ABI // Arcane evil code interprets a C++ pointer to function as specified in the // -fapple-kext ABI, i.e. the gcc-2.95 generated code. IT DOES NOT ALLOW // the conversion of functions that are from MULTIPLY inherited classes.
-typedef void (*_ptf_t)(void); - static inline _ptf_t _ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void)) { @@ -141,6 +150,43 @@ _ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void)) } } +#else /* !APPLE_KEXT_LEGACY_ABI */ + + +// Slightly less arcane and slightly less evil code to do +// the same for kexts compiled with the standard Itanium C++ +// ABI + +static inline _ptf_t +_ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void)) +{ + union { + void (OSMetaClassBase::*fIn)(void); + uintptr_t fVTOffset; + _ptf_t fPFN; + } map; + + map.fIn = func; + + if (map.fVTOffset & 1) { + // virtual + union { + const OSMetaClassBase *fObj; + _ptf_t **vtablep; + } u; + u.fObj = self; + + // Virtual member function so dereference vtable + return *(_ptf_t *)(((uintptr_t)*u.vtablep) + map.fVTOffset - 1); + } else { + // Not virtual, i.e. plain member func + return map.fPFN; + } +} + + +#endif /* !APPLE_KEXT_LEGACY_ABI */ + /*! @function OSMemberFunctionCast @abstract Convert a pointer to a member function to a c-style pointer to function. No warnings are generated. @param type The type of pointer function desired. 
diff --git a/libkern/libkern/crypto/sha1.h b/libkern/libkern/crypto/sha1.h index 47a6e11c3..8ecb9e2f7 100644 --- a/libkern/libkern/crypto/sha1.h +++ b/libkern/libkern/crypto/sha1.h @@ -60,6 +60,7 @@ typedef struct sha1_ctxt { extern void SHA1Init(SHA1_CTX *); extern void SHA1Update(SHA1_CTX *, const void *, size_t); +extern void SHA1UpdateUsePhysicalAddress(SHA1_CTX *context, const void *inpp, size_t inputLen); extern void SHA1Final(void *, SHA1_CTX *); #ifdef __cplusplus diff --git a/libsa/catalogue.cpp b/libsa/catalogue.cpp index 886094ab6..ab7ce249a 100644 --- a/libsa/catalogue.cpp +++ b/libsa/catalogue.cpp @@ -373,7 +373,7 @@ bool validateExtensionDict(OSDictionary * extension, int index) { goto finish; } - } else if (PE_parse_boot_arg("-x", namep)) { /* safe boot */ + } else if (PE_parse_boot_argn("-x", namep, sizeof (namep))) { /* safe boot */ ineligible_for_safe_boot = true; result = false; goto finish; @@ -502,6 +502,30 @@ OSDictionary * compareExtensionVersions( goto finish; } + if (0 == strcmp("com.apple.driver.AppleIntelCPUPowerManagement", + incumbentName->getCStringNoCopy())) { + /* Special rules. Always favor version 51.0.0 exactly at the + * expense of all other versions newer or older. 
+ */ + if(0 == strcmp(incumbentVersionString->getCStringNoCopy(), "51.0.0")) { + IOLog(VTYELLOW "Skipping duplicate extension \"%s\" with " + " version (%s -> %s).\n" VTRESET, + candidateName->getCStringNoCopy(), + candidateVersionString->getCStringNoCopy(), + incumbentVersionString->getCStringNoCopy()); + winner = incumbent; + goto finish; + } else if (0 == strcmp(candidateVersionString->getCStringNoCopy(), "51.0.0")) { + IOLog(VTYELLOW "Skipping duplicate extension \"%s\" with " + " version (%s -> %s).\n" VTRESET, + candidateName->getCStringNoCopy(), + incumbentVersionString->getCStringNoCopy(), + candidateVersionString->getCStringNoCopy()); + winner = candidate; + goto finish; + } + } + if (candidate_vers > incumbent_vers) { IOLog(VTYELLOW "Replacing extension \"%s\" with newer version " "(%s -> %s).\n" VTRESET, diff --git a/libsyscall/mach/exc_catcher.c b/libsyscall/mach/exc_catcher.c index 507cb1000..a85086519 100644 --- a/libsyscall/mach/exc_catcher.c +++ b/libsyscall/mach/exc_catcher.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include __private_extern__ kern_return_t internal_catch_exception_raise ( @@ -52,7 +52,7 @@ __private_extern__ kern_return_t internal_catch_exception_raise ( static kern_return_t (*func)(mach_port_t, mach_port_t, mach_port_t, exception_type_t, exception_data_t, mach_msg_type_number_t); if (checkForFunction == 0) { checkForFunction = 1; - _dyld_lookup_and_bind("_catch_exception_raise", (unsigned long *)&func, (void **)0); + func = dlsym(RTLD_DEFAULT, "catch_exception_raise"); } if (func == 0) { /* The user hasn't defined catch_exception_raise in their binary */ diff --git a/libsyscall/mach/exc_catcher_state.c b/libsyscall/mach/exc_catcher_state.c index c372d1c20..efcb5344c 100644 --- a/libsyscall/mach/exc_catcher_state.c +++ b/libsyscall/mach/exc_catcher_state.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include __private_extern__ kern_return_t internal_catch_exception_raise_state ( @@ 
-55,7 +55,7 @@ __private_extern__ kern_return_t internal_catch_exception_raise_state ( static kern_return_t (*func)(mach_port_t, exception_type_t, exception_data_t, mach_msg_type_number_t, int *, thread_state_t, mach_msg_type_number_t, thread_state_t, mach_msg_type_number_t *); if (checkForFunction == 0) { checkForFunction = 1; - _dyld_lookup_and_bind("_catch_exception_raise_state", (unsigned long *)&func, (void **)0); + func = dlsym(RTLD_DEFAULT, "catch_exception_raise_state"); } if (func == 0) { /* The user hasn't defined catch_exception_raise in their binary */ diff --git a/libsyscall/mach/exc_catcher_state_identity.c b/libsyscall/mach/exc_catcher_state_identity.c index 139b772c2..1e0c5c0df 100644 --- a/libsyscall/mach/exc_catcher_state_identity.c +++ b/libsyscall/mach/exc_catcher_state_identity.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include __private_extern__ kern_return_t internal_catch_exception_raise_state_identity ( @@ -57,7 +57,7 @@ __private_extern__ kern_return_t internal_catch_exception_raise_state_identity ( static kern_return_t (*func)(mach_port_t, mach_port_t, mach_port_t, exception_type_t, exception_data_t, mach_msg_type_number_t, int *, thread_state_t, mach_msg_type_number_t, thread_state_t, mach_msg_type_number_t *); if (checkForFunction == 0) { checkForFunction = 1; - _dyld_lookup_and_bind("_catch_exception_raise_state_identity", (unsigned long *)&func, (void **)0); + func = dlsym(RTLD_DEFAULT, "catch_exception_raise_state_identity"); } if (func == 0) { /* The user hasn't defined catch_exception_raise in their binary */ diff --git a/makedefs/MakeInc.cmd b/makedefs/MakeInc.cmd index 6af46de7f..1fa58a067 100644 --- a/makedefs/MakeInc.cmd +++ b/makedefs/MakeInc.cmd @@ -11,6 +11,7 @@ LN = /bin/ln -fs CAT = /bin/cat MKDIR = /bin/mkdir -p FIND = /usr/bin/find +INSTALL = /usr/bin/install TAR = /usr/bin/gnutar STRIP = /usr/bin/strip diff --git a/makedefs/MakeInc.def b/makedefs/MakeInc.def index 5ac6c41b4..fab9fa524 100644 --- 
a/makedefs/MakeInc.def +++ b/makedefs/MakeInc.def @@ -220,10 +220,6 @@ export CFLAGS_DEVELOPMENT = export CFLAGS_DEBUG = export CFLAGS_PROFILE = -pg -ifeq ($(ARCH_CONFIG),ARM) -BUILD_STABS = 1 -endif - ifeq ($(BUILD_STABS),1) export CFLAGS_PPC = -Dppc -DPPC -D__PPC__ -DPAGE_SIZE_FIXED \ -mno-altivec -gstabs+ -force_cpusubtype_ALL diff --git a/makedefs/MakeInc.rule b/makedefs/MakeInc.rule index add2ecf97..9e62069ae 100644 --- a/makedefs/MakeInc.rule +++ b/makedefs/MakeInc.rule @@ -72,7 +72,7 @@ $(INSTALL_MI_GEN_FILES): $(DSTROOT)/$(INCDIR)/$(INSTALL_MI_DIR)/% : % ./incmidir/$${filename_strip}; \ if [ -s ./incmidir/$${filename_strip} ]; \ then ( \ - install $(INSTALL_FLAGS) ./incmidir/$${filename} $(dir $@);\ + $(INSTALL) $(INSTALL_FLAGS) ./incmidir/$${filename} $(dir $@);\ ); \ else \ echo Header file $< not exported; \ @@ -94,7 +94,7 @@ $(INSTALL_KF_MI_GEN_FILES): $(DSTROOT)/$(KINCDIR)/$(EXPORT_MI_DIR)/% : % ./kincmidir/$${filename_strip}; \ if [ -s ./kincmidir/$${filename_strip} ]; \ then ( \ - install $(INSTALL_FLAGS) ./kincmidir/$${filename} $(dir $@);\ + $(INSTALL) $(INSTALL_FLAGS) ./kincmidir/$${filename} $(dir $@);\ ); \ else \ echo Header file $< not exported; \ @@ -116,7 +116,7 @@ $(INSTALL_MI_GEN_LCL_FILES): $(DSTROOT)/$(LCLDIR)/$(INSTALL_MI_DIR)/% : % ./pincmidir/$${filename_strip}; \ if [ -s ./pincmidir/$${filename_strip} ]; \ then ( \ - install $(INSTALL_FLAGS) ./pincmidir/$${filename} $(dir $@);\ + $(INSTALL) $(INSTALL_FLAGS) ./pincmidir/$${filename} $(dir $@);\ ); \ else \ echo Header file $< not exported; \ @@ -138,7 +138,7 @@ $(INSTALL_KF_MI_LCL_GEN_FILES): $(DSTROOT)/$(KPINCDIR)/$(EXPORT_MI_DIR)/% : % ./kpincmidir/$${filename_strip}; \ if [ -s ./kpincmidir/$${filename_strip} ]; \ then ( \ - install $(INSTALL_FLAGS) ./kpincmidir/$${filename} $(dir $@);\ + $(INSTALL) $(INSTALL_FLAGS) ./kpincmidir/$${filename} $(dir $@);\ ); \ else \ echo Header file $< not exported; \ @@ -160,7 +160,7 @@ $(INSTALL_MD_GEN_INC_FILES): 
$(DSTROOT)/$(INCDIR)/$(INSTALL_MD_DIR)/% : % ./incdir/$${filename_strip}; \ if [ -s ./incdir/$${filename_strip} ]; \ then ( \ - install $(INSTALL_FLAGS) ./incdir/$${filename} $(dir $@);\ + $(INSTALL) $(INSTALL_FLAGS) ./incdir/$${filename} $(dir $@);\ ); \ else \ echo Header file $< not exported; \ @@ -182,7 +182,7 @@ $(INSTALL_KF_MD_GEN_FILES): $(DSTROOT)/$(KINCDIR)/$(EXPORT_MD_DIR)/% : % ./kincdir/$${filename_strip}; \ if [ -s ./kincdir/$${filename_strip} ]; \ then ( \ - install $(INSTALL_FLAGS) ./kincdir/$${filename} $(dir $@);\ + $(INSTALL) $(INSTALL_FLAGS) ./kincdir/$${filename} $(dir $@);\ ); \ else \ echo Header file $< not exported; \ @@ -205,7 +205,7 @@ $(INSTALL_MD_GEN_LCL_FILES): $(DSTROOT)/$(LCLDIR)/$(INSTALL_MD_DIR)/% : % ./pincdir/$${filename_strip}; \ if [ -s ./pincdir/$${filename_strip} ]; \ then ( \ - install $(INSTALL_FLAGS) ./pincdir/$${filename} $(dir $@);\ + $(INSTALL) $(INSTALL_FLAGS) ./pincdir/$${filename} $(dir $@);\ ); \ else \ echo Header file $< not exported; \ @@ -228,7 +228,7 @@ $(INSTALL_KF_MD_LCL_GEN_FILES): $(DSTROOT)/$(KPINCDIR)/$(EXPORT_MD_DIR)/% : % ./kpincdir/$${filename_strip}; \ if [ -s ./kpincdir/$${filename_strip} ]; \ then ( \ - install $(INSTALL_FLAGS) ./kpincdir/$${filename} $(dir $@);\ + $(INSTALL) $(INSTALL_FLAGS) ./kpincdir/$${filename} $(dir $@);\ ); \ else \ echo Header file $< not exported; \ @@ -254,7 +254,7 @@ do_installhdrs_mi: $(INSTALL_MI_GEN_FILES) $(INSTALL_MI_GEN_LCL_FILES) $(INSTALL ./incmidir/$$j.strip; \ if [ -s ./incmidir/$$j.strip ]; \ then ( \ - install $(INSTALL_FLAGS) ./incmidir/$$j $(DSTROOT)/$(INCDIR)/$(INSTALL_MI_DIR); \ + $(INSTALL) $(INSTALL_FLAGS) ./incmidir/$$j $(DSTROOT)/$(INCDIR)/$(INSTALL_MI_DIR); \ ); \ else \ echo Header file $$j not exported; \ @@ -276,7 +276,7 @@ do_installhdrs_mi: $(INSTALL_MI_GEN_FILES) $(INSTALL_MI_GEN_LCL_FILES) $(INSTALL ./pincmidir/$$j.strip; \ if [ -s ./pincmidir/$$j.strip ]; \ then ( \ - install $(INSTALL_FLAGS) ./pincmidir/$$j 
$(DSTROOT)/$(LCLDIR)/$(INSTALL_MI_DIR); \ + $(INSTALL) $(INSTALL_FLAGS) ./pincmidir/$$j $(DSTROOT)/$(LCLDIR)/$(INSTALL_MI_DIR); \ ); \ else \ echo Header file $$j not exported; \ @@ -298,7 +298,7 @@ do_installhdrs_mi: $(INSTALL_MI_GEN_FILES) $(INSTALL_MI_GEN_LCL_FILES) $(INSTALL ./kincmidir/$$j.strip; \ if [ -s ./kincmidir/$$j.strip ]; \ then ( \ - install $(INSTALL_FLAGS) ./kincmidir/$$j $(DSTROOT)/$(KINCDIR)/$(EXPORT_MI_DIR); \ + $(INSTALL) $(INSTALL_FLAGS) ./kincmidir/$$j $(DSTROOT)/$(KINCDIR)/$(EXPORT_MI_DIR); \ ); \ else \ echo Header file $$j not exported; \ @@ -320,7 +320,7 @@ do_installhdrs_mi: $(INSTALL_MI_GEN_FILES) $(INSTALL_MI_GEN_LCL_FILES) $(INSTALL ./kpincmidir/$$j.strip; \ if [ -s ./kpincmidir/$$j.strip ]; \ then ( \ - install $(INSTALL_FLAGS) ./kpincmidir/$$j $(DSTROOT)/$(KPINCDIR)/$(EXPORT_MI_DIR); \ + $(INSTALL) $(INSTALL_FLAGS) ./kpincmidir/$$j $(DSTROOT)/$(KPINCDIR)/$(EXPORT_MI_DIR); \ ); \ else \ echo Header file $$j not exported; \ @@ -349,7 +349,7 @@ do_installhdrs_md: $(INSTALL_MD_GEN_INC_FILES) $(INSTALL_MD_GEN_LCL_FILES) $(INS ./incdir/$$j.strip; \ if [ -s ./incdir/$$j.strip ]; \ then ( \ - install $(INSTALL_FLAGS) ./incdir/$$j $(DSTROOT)/$(INCDIR)/$(INSTALL_MD_DIR); \ + $(INSTALL) $(INSTALL_FLAGS) ./incdir/$$j $(DSTROOT)/$(INCDIR)/$(INSTALL_MD_DIR); \ ); \ else \ echo Header file $$j not exported; \ @@ -371,7 +371,7 @@ do_installhdrs_md: $(INSTALL_MD_GEN_INC_FILES) $(INSTALL_MD_GEN_LCL_FILES) $(INS ./pincdir/$$j.strip; \ if [ -s ./pincdir/$$j.strip ]; \ then ( \ - install $(INSTALL_FLAGS) ./pincdir/$$j $(DSTROOT)/$(LCLDIR)/$(INSTALL_MD_DIR); \ + $(INSTALL) $(INSTALL_FLAGS) ./pincdir/$$j $(DSTROOT)/$(LCLDIR)/$(INSTALL_MD_DIR); \ ); \ else \ echo Header file $$j not exported; \ @@ -393,7 +393,7 @@ do_installhdrs_md: $(INSTALL_MD_GEN_INC_FILES) $(INSTALL_MD_GEN_LCL_FILES) $(INS ./kincdir/$$j.strip; \ if [ -s ./kincdir/$$j.strip ]; \ then ( \ - install $(INSTALL_FLAGS) ./kincdir/$$j $(DSTROOT)/$(KINCDIR)/$(EXPORT_MD_DIR); \ + $(INSTALL) 
$(INSTALL_FLAGS) ./kincdir/$$j $(DSTROOT)/$(KINCDIR)/$(EXPORT_MD_DIR); \ ); \ else \ echo Header file $$j not exported; \ @@ -415,7 +415,7 @@ do_installhdrs_md: $(INSTALL_MD_GEN_INC_FILES) $(INSTALL_MD_GEN_LCL_FILES) $(INS ./kpincdir/$$j.strip; \ if [ -s ./kpincdir/$$j.strip ]; \ then ( \ - install $(INSTALL_FLAGS) ./kpincdir/$$j $(DSTROOT)/$(KPINCDIR)/$(EXPORT_MD_DIR); \ + $(INSTALL) $(INSTALL_FLAGS) ./kpincdir/$$j $(DSTROOT)/$(KPINCDIR)/$(EXPORT_MD_DIR); \ ); \ else \ echo Header file $$j not exported; \ @@ -605,7 +605,7 @@ endif # mach_kernel building rules # do_build_mach_kernel: $(OBJPATH)/kgmacros - $(_v)install $(DATA_INSTALL_FLAGS) $(SRCROOT)/config/version.c $(OBJPATH)/version.c; + $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $(SRCROOT)/config/version.c $(OBJPATH)/version.c; $(_v)$(SRCROOT)/config/newvers.pl $(OBJPATH)/version.c > /dev/null; @echo CC version.o $(_v)${KCC} -c ${filter-out ${${join $@,_CFLAGS_RM}}, ${CFLAGS}} ${${join $@,_CFLAGS_ADD}} ${INCFLAGS} ${${join $@,_INCFLAGS}} $(OBJPATH)/version.c -o $(OBJPATH)/version.o @@ -619,7 +619,7 @@ do_build_mach_kernel: $(OBJPATH)/kgmacros $(_v)$(STRIP) $(STRIP_FLAGS) $(TARGET)/mach_kernel.sys -o $(TARGET)/mach_kernel $(OBJPATH)/kgmacros: $(SRCROOT)/kgmacros - $(_v)$(CP) $? $@ + $(_v)$(INSTALL) $(INSTALL_FLAGS) $? $@ # Special rules to install machine configuration variants @@ -630,7 +630,7 @@ $(DSTROOT)$(INSTALL_FILE_DIR)mach.$(KERNEL_CONFIG_LC).$(MACHINE_CONFIG_LC): $(TA fi; \ if [ "`echo $(INSTALL_ARCHS_LC) | wc -w`" -eq 1 ]; then \ $(RM) $(RMFLAGS) $@; \ - install $(FILE_INSTALL_FLAGS) $< $@; \ + $(INSTALL) $(FILE_INSTALL_FLAGS) $< $@; \ else \ if [ ! 
-e $@ ]; then \ echo >empty_file_$(notdir $@); \ @@ -648,7 +648,14 @@ $(SYMROOT)$(INSTALL_FILE_DIR)mach.$(KERNEL_CONFIG_LC).$(MACHINE_CONFIG_LC): $(TA fi; \ if [ "`echo $(INSTALL_ARCHS_LC) | wc -w`" -eq 1 ]; then \ $(RM) $(RMFLAGS) $@; \ - install $(FILE_INSTALL_FLAGS) $< $@; \ + $(INSTALL) $(FILE_INSTALL_FLAGS) $< $@; \ + if [ $(BUILD_DWARF) -eq 1 ]; then \ + $(RM) -rf $@.dSYM; \ + $(MKDIR) -p -m 0755 $@.dSYM/$(DSYMBUILDDIR); \ + $(INSTALL) $(INSTALL_FLAGS) \ + $<.dSYM/$(DSYMBUILDDIR)/$(notdir $<) \ + $@.dSYM/$(DSYMBUILDDIR)/$(notdir $@); \ + fi; \ else \ if [ ! -e $@ ]; then \ echo >empty_file_$(notdir $@); \ @@ -675,7 +682,7 @@ $(INSTALL_FILE_FILES_GENERIC): $(DSTROOT)$(INSTALL_FILE_DIR)% : $(TARGET)/% forc fi; \ if [ "`echo $(INSTALL_ARCHS_LC) | wc -w`" -eq 1 ]; then \ $(RM) $(RMFLAGS) $@; \ - install $(FILE_INSTALL_FLAGS) $< $@; \ + $(INSTALL) $(FILE_INSTALL_FLAGS) $< $@; \ else \ if [ ! -e $@ ]; then \ echo >empty_file_$(notdir $@); \ @@ -692,7 +699,7 @@ $(INSTALL_FILE_FILES_GENERIC): $(DSTROOT)$(INSTALL_FILE_DIR)% : $(TARGET)/% forc -exec $(RM) -rf {} \; ; \ $(CTFMERGE) -l xnu -o $<.ctfsys \ $(OBJPATH)/*/$(KERNEL_CONFIG)/*.*o.ctf || true; \ - install $(FILE_INSTALL_FLAGS) $<.ctfsys $(dir $@); \ + $(INSTALL) $(FILE_INSTALL_FLAGS) $<.ctfsys $(dir $@); \ else \ if [ ! 
-e $@.ctfsys ]; then \ echo >empty_file_$(notdir $@); \ @@ -722,14 +729,14 @@ $(INSTALL_FILESYS_FILES_GENERIC): $(SYMROOT)$(INSTALL_FILE_DIR)% : $(TARGET)/%.s fi; \ if [ "`echo $(INSTALL_ARCHS_LC) | wc -w`" -eq 1 ]; then \ $(RM) $(RMFLAGS) $@; \ - install $(INSTALL_FLAGS) $< $@; \ + $(INSTALL) $(INSTALL_FLAGS) $< $@; \ if [ $(BUILD_DWARF) -eq 1 ]; then \ $(DSYMUTIL) $(DSYMUTIL_FLAGS) \ $(TARGET)/mach_kernel.sys \ -o $(TARGET)/mach_kernel.sys.dSYM; \ $(RM) -rf $@.dSYM; \ $(MKDIR) -p -m 0755 $@.dSYM/$(DSYMBUILDDIR); \ - install $(INSTALL_FLAGS) \ + $(INSTALL) $(INSTALL_FLAGS) \ $<.dSYM/$(DSYMBUILDDIR)/$(notdir $<) \ $@.dSYM/$(DSYMBUILDDIR)/$(notdir $@); \ fi; \ @@ -761,7 +768,7 @@ $(INSTALL_FILESYS_FILES_GENERIC): $(SYMROOT)$(INSTALL_FILE_DIR)% : $(TARGET)/%.s -o $@.dSYM/$(DSYMBUILDDIR)/$(notdir $@); \ fi; \ fi - $(CP) $(SOURCE)kgmacros $(SYMROOT)$(INSTALL_FILE_DIR) + $(INSTALL) $(INSTALL_FLAGS) $(SOURCE)kgmacros $(SYMROOT)$(INSTALL_FILE_DIR) INSTALL_DATA_FILES = $(addprefix $(DSTROOT)$(INSTALL_DATA_DIR), $(INSTALL_DATA_LIST)) @@ -769,7 +776,7 @@ $(INSTALL_DATA_FILES): $(DSTROOT)$(INSTALL_DATA_DIR)% : $(SOURCE)/% @echo Installing $< in $@; $(_v)[ -d $(dir $@) ] ||$(MKDIR) $(dir $@); \ $(RM) $(RMFLAGS) $@; \ - install $(DATA_INSTALL_FLAGS) $< $(dir $@); + $(INSTALL) $(DATA_INSTALL_FLAGS) $< $(dir $@); setup_build_install: @echo "[ $(SOURCE) ] make setup_build_install $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" @@ -790,7 +797,7 @@ do_installman: $(INSTALL_MAN_FILES) $(MKDIR) $$man_dir; \ fi; \ echo Installing $(INSTALL_MAN_LIST) in $$man_dir; \ - install $(INSTALL_FLAGS) $(INSTALL_MAN_LIST) $$man_dir; \ + $(INSTALL) $(INSTALL_FLAGS) $(INSTALL_MAN_LIST) $$man_dir; \ if [ -n "$(strip $(INSTALL_MAN_LINKS))" ]; then \ set `echo ${INSTALL_MAN_LINKS}`; \ while : ; do \ @@ -809,7 +816,7 @@ $(INSTALL_MAN_FILES): $(DSTROOT)/$(MANDIR)/$(INSTALL_MAN_DIR)/% : % @true echo Installing $< in $(dir $@) $(_v)$(MKDIR) $(DSTROOT)/$(MANDIR)/$(INSTALL_MAN_DIR); \ $(RM) $(RMFLAGS) $@; \ - 
install $(INSTALL_FLAGS) $< $(dir $@); + $(INSTALL) $(INSTALL_FLAGS) $< $(dir $@); ifeq ($(INCL_MAKEDEP), TRUE) -include Makedep diff --git a/osfmk/chud/i386/chud_cpu_i386.c b/osfmk/chud/i386/chud_cpu_i386.c index 8510249dc..2ee2e428e 100644 --- a/osfmk/chud/i386/chud_cpu_i386.c +++ b/osfmk/chud/i386/chud_cpu_i386.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/osfmk/chud/i386/chud_osfmk_callback_i386.c b/osfmk/chud/i386/chud_osfmk_callback_i386.c index a3f12cf15..0c0fafa35 100644 --- a/osfmk/chud/i386/chud_osfmk_callback_i386.c +++ b/osfmk/chud/i386/chud_osfmk_callback_i386.c @@ -46,6 +46,7 @@ #include #include +#include #include #include diff --git a/osfmk/conf/MASTER b/osfmk/conf/MASTER index 5f7d87d6d..08c28268c 100644 --- a/osfmk/conf/MASTER +++ b/osfmk/conf/MASTER @@ -222,6 +222,11 @@ options CONFIG_NO_KPRINTF_STRINGS # # options CONFIG_EMBEDDED # +# only execute signed code. Hang this off config_embedded since there's +# nothing more appropriate right now +# +options CONFIG_ENFORCE_SIGNED_CODE # + # jettison_kernel_linker - jettison kernel linker after kernel init; don't wait for kextd to launch options CONFIG_JETTISON_KERNEL_LINKER # @@ -230,3 +235,9 @@ options CONFIG_VC_PROGRESS_WHITE # # secure_kernel - secure kernel from user programs options SECURE_KERNEL # + +# +# code decryption... 
used on embedded for app protection +# must be set in all the bsd/conf and osfmk/conf MASTER files +# +options CONFIG_CODE_DECRYPTION # diff --git a/osfmk/conf/MASTER.i386 b/osfmk/conf/MASTER.i386 index 5eb745c87..07289a808 100644 --- a/osfmk/conf/MASTER.i386 +++ b/osfmk/conf/MASTER.i386 @@ -9,12 +9,12 @@ # Standard Apple MacOS X Configurations: # -------- ---- -------- --------------- # -# RELEASE = [ medium intel pc iokit mach_pe mach mach_kdp event vol hd pst gdb fixpri simple_clock mkernserv uxpr kernstack ipc_compat ipc_debug fb mk30 mk30_i386 hibernation crypto config_dtrace] +# RELEASE = [ medium intel pc iokit mach_pe mach mach_kdp config_serial_kdp event vol hd pst gdb fixpri simple_clock mkernserv uxpr kernstack ipc_compat ipc_debug fb mk30 mk30_i386 hibernation crypto config_dtrace] # DEBUG_KDP = [ RELEASE osf_debug debug ] # DEBUG= [ RELEASE osf_debug debug mach_kdb mach_assert ] # PROFILE = [ RELEASE profile ] # -# EMBEDDED_BASE = [ bsmall intel pc iokit mach_pe mach mach_kdp event vol hd pst gdb fixpri simple_clock mkernserv uxpr kernstack ipc_compat ipc_debug fb mk30 mk30_i386 hibernation crypto ] +# EMBEDDED_BASE = [ bsmall intel pc iokit mach_pe mach mach_kdp serial_kdp event vol hd pst gdb fixpri simple_clock mkernserv uxpr kernstack ipc_compat ipc_debug fb mk30 mk30_i386 hibernation crypto ] # EMBEDDED = [ EMBEDDED_BASE no_printf_str no_kprintf_str no_kdebug ] # DEVELOPMENT = [ EMBEDDED_BASE mach_assert config_dtrace ] # @@ -54,6 +54,7 @@ options MACH_PE # # options DDB # Inline debugger # options MACH_KDB # # options MACH_KDP # KDP # +options CONFIG_SERIAL_KDP # KDP over serial # options PAE options X86_64 options DISPATCH_COUNTS @@ -64,3 +65,9 @@ options DISPATCH_COUNTS # options CONFIG_MACF # Mandatory Access Control Framework #options CONFIG_MACF_MACH # MACF applied to Mach services + +# +# code decryption... 
used on i386 for DSMOS +# must be set in all the bsd/conf and osfmk/conf MASTER files +# +options CONFIG_CODE_DECRYPTION diff --git a/osfmk/conf/files b/osfmk/conf/files index 9bfe3b7dd..e5a07a963 100644 --- a/osfmk/conf/files +++ b/osfmk/conf/files @@ -47,6 +47,7 @@ OPTIONS/mach_ipc_test optional mach_ipc_test OPTIONS/mach_kdb optional mach_kdb OPTIONS/mach_kgdb optional mach_kgdb OPTIONS/mach_kdp optional mach_kdp +OPTIONS/config_serial_kdp optional config_serial_kdp OPTIONS/mach_kprof optional mach_kprof OPTIONS/mach_ldebug optional mach_ldebug OPTIONS/mach_mp_debug optional mach_mp_debug @@ -123,6 +124,7 @@ osfmk/ddb/db_write_cmd.c optional mach_kdb osfmk/ddb/tr.c optional mach_tr osfmk/kdp/kdp.c optional mach_kdp osfmk/kdp/kdp_udp.c optional mach_kdp +osfmk/kdp/kdp_serial.c optional config_serial_kdp osfmk/ipc/ipc_entry.c standard osfmk/ipc/ipc_hash.c standard osfmk/ipc/ipc_init.c standard diff --git a/osfmk/conf/files.i386 b/osfmk/conf/files.i386 index 421ab6358..a41da57da 100644 --- a/osfmk/conf/files.i386 +++ b/osfmk/conf/files.i386 @@ -71,7 +71,6 @@ osfmk/i386/trap.c standard osfmk/i386/user_ldt.c standard osfmk/i386/Diagnostics.c standard osfmk/i386/pmCPU.c standard -osfmk/i386/hpet.c standard osfmk/i386/tsc.c standard osfmk/i386/commpage/commpage.c standard @@ -98,6 +97,7 @@ osfmk/i386/commpage/commpage_sigs.c standard osfmk/i386/AT386/conf.c standard osfmk/i386/AT386/model_dep.c standard +osfmk/i386/lapic.c standard osfmk/i386/mp.c standard osfmk/i386/mp_slave_boot.s standard diff --git a/osfmk/console/panic_dialog.c b/osfmk/console/panic_dialog.c index 52a26a331..78754910b 100644 --- a/osfmk/console/panic_dialog.c +++ b/osfmk/console/panic_dialog.c @@ -51,7 +51,7 @@ static int panic_dialog_verify( const struct panicimage * data, unsigned int siz static int pixels_needed_to_blit_digit( int digit ); static void blit_digit( int digit ); static const char * strnstr(const char * s, const char * find, size_t slen); -static void dim_screen(void); +void 
dim_screen(void); static void panic_blit_rect(unsigned int x, unsigned int y, unsigned int width, unsigned int height, int transparent, const unsigned char * dataPtr); @@ -779,7 +779,7 @@ decode_rle(const unsigned char *dataPtr, unsigned int *quantity, } -static void +void dim_screen(void) { unsigned long *p, *endp, *row; diff --git a/osfmk/console/video_console.c b/osfmk/console/video_console.c index c42f0df5a..4a040103f 100644 --- a/osfmk/console/video_console.c +++ b/osfmk/console/video_console.c @@ -2317,7 +2317,7 @@ initialize_screen(PE_Video * boot_vinfo, unsigned int op) new_vinfo.v_baseaddr = newVideoVirt + boot_vinfo->v_offset; /* Set the new framebuffer address */ else new_vinfo.v_baseaddr = lastVideoVirt + boot_vinfo->v_offset; /* Set the new framebuffer address */ - + /* Update the vinfo structure atomically with respect to the vc_progress task if running */ if (vc_progress) { diff --git a/osfmk/ddb/db_command.c b/osfmk/ddb/db_command.c index f1962e8d1..cf99ace6b 100644 --- a/osfmk/ddb/db_command.c +++ b/osfmk/ddb/db_command.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -768,11 +768,6 @@ struct db_command db_command_table[] = { .fcn = db_apic, .flag = CS_MORE, }, - { - .name = "hp", - .fcn = db_hpet, - .flag = CS_MORE, - }, #endif /* !__ppc__ */ #if defined(__ppc__) { diff --git a/osfmk/default_pager/dp_backing_store.c b/osfmk/default_pager/dp_backing_store.c index 78c99073f..5dfcf6952 100644 --- a/osfmk/default_pager/dp_backing_store.c +++ b/osfmk/default_pager/dp_backing_store.c @@ -3565,6 +3565,7 @@ vs_cluster_transfer( if (size == 0) { ASSERT(unavail_size); + ps_clunmap(vs, offset, unavail_size); cnt -= unavail_size; offset += unavail_size; if((offset & ((vm_page_size << vs->vs_clshift) - 1)) @@ -3643,6 +3644,7 @@ vs_cluster_transfer( */ write_vsmap = *vsmap_ptr; *vsmap_ptr = read_vsmap; + ps_clunmap(vs, offset, size); } else { /* discard the old backing object */ write_vsmap = *vsmap_ptr; diff --git a/osfmk/default_pager/dp_memory_object.c b/osfmk/default_pager/dp_memory_object.c index 4690f5d3e..c2e488dce 100644 --- a/osfmk/default_pager/dp_memory_object.c +++ b/osfmk/default_pager/dp_memory_object.c @@ -367,7 +367,8 @@ const struct memory_object_pager_ops default_pager_ops = { dp_memory_object_data_initialize, dp_memory_object_data_unlock, dp_memory_object_synchronize, - dp_memory_object_unmap, + dp_memory_object_map, + dp_memory_object_last_unmap, "default pager" }; @@ -414,11 +415,19 @@ dp_memory_object_synchronize( } kern_return_t -dp_memory_object_unmap( - __unused memory_object_t mem_obj) +dp_memory_object_map( + __unused memory_object_t mem_obj, + __unused vm_prot_t prot) { - panic("dp_memory_object_unmap"); + panic("dp_memory_object_map"); + return KERN_FAILURE; +} +kern_return_t +dp_memory_object_last_unmap( + __unused memory_object_t mem_obj) +{ + panic("dp_memory_object_last_unmap"); return KERN_FAILURE; } diff --git a/osfmk/i386/AT386/model_dep.c b/osfmk/i386/AT386/model_dep.c index bd0182888..73ea3948a 100644 --- a/osfmk/i386/AT386/model_dep.c +++ 
b/osfmk/i386/AT386/model_dep.c @@ -152,7 +152,7 @@ machine_startup(void) halt_in_debugger = halt_in_debugger ? 0 : 1; #endif - if (PE_parse_boot_arg("debug", &boot_arg)) { + if (PE_parse_boot_argn("debug", &boot_arg, sizeof (boot_arg))) { if (boot_arg & DB_HALT) halt_in_debugger=1; if (boot_arg & DB_PRT) disable_debug_output=FALSE; if (boot_arg & DB_SLOG) systemLogDiags=TRUE; @@ -160,14 +160,14 @@ machine_startup(void) if (boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE; } - if (!PE_parse_boot_arg("nvram_paniclog", &commit_paniclog_to_nvram)) + if (!PE_parse_boot_argn("nvram_paniclog", &commit_paniclog_to_nvram, sizeof (commit_paniclog_to_nvram))) commit_paniclog_to_nvram = 1; /* * Entering the debugger will put the CPUs into a "safe" * power mode. */ - if (PE_parse_boot_arg("pmsafe_debug", &boot_arg)) + if (PE_parse_boot_argn("pmsafe_debug", &boot_arg, sizeof (boot_arg))) pmsafe_debug = boot_arg; #if NOTYET @@ -199,25 +199,25 @@ machine_startup(void) } #endif /* MACH_KDB */ - if (PE_parse_boot_arg("preempt", &boot_arg)) { + if (PE_parse_boot_argn("preempt", &boot_arg, sizeof (boot_arg))) { default_preemption_rate = boot_arg; } - if (PE_parse_boot_arg("unsafe", &boot_arg)) { + if (PE_parse_boot_argn("unsafe", &boot_arg, sizeof (boot_arg))) { max_unsafe_quanta = boot_arg; } - if (PE_parse_boot_arg("poll", &boot_arg)) { + if (PE_parse_boot_argn("poll", &boot_arg, sizeof (boot_arg))) { max_poll_quanta = boot_arg; } - if (PE_parse_boot_arg("yield", &boot_arg)) { + if (PE_parse_boot_argn("yield", &boot_arg, sizeof (boot_arg))) { sched_poll_yield_shift = boot_arg; } - if (PE_parse_boot_arg("idlehalt", &boot_arg)) { + if (PE_parse_boot_argn("idlehalt", &boot_arg, sizeof (boot_arg))) { idlehalt = boot_arg; } /* The I/O port to issue a read from, in the event of a panic. Useful for * triggering logic analyzers. 
*/ - if (PE_parse_boot_arg("panic_io_port", &boot_arg)) { + if (PE_parse_boot_argn("panic_io_port", &boot_arg, sizeof (boot_arg))) { /*I/O ports range from 0 through 0xFFFF */ panic_io_port = boot_arg & 0xffff; } @@ -968,7 +968,7 @@ panic_i386_backtrace(void *_frame, int nframes, const char *msg, boolean_t regdu pbtcpu = cpu_number(); } - PE_parse_boot_arg("keepsyms", &keepsyms); + PE_parse_boot_argn("keepsyms", &keepsyms, sizeof (keepsyms)); if (msg != NULL) { kdb_printf(msg); diff --git a/osfmk/i386/Diagnostics.c b/osfmk/i386/Diagnostics.c index 34209af9e..74f806a63 100644 --- a/osfmk/i386/Diagnostics.c +++ b/osfmk/i386/Diagnostics.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2005-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -69,7 +69,6 @@ #include #include #include -#include #include extern uint64_t lastNapClear; diff --git a/osfmk/i386/Makefile b/osfmk/i386/Makefile index a5fa0188e..6cae8e5cd 100644 --- a/osfmk/i386/Makefile +++ b/osfmk/i386/Makefile @@ -16,8 +16,8 @@ EXPORT_ONLY_FILES = \ cpu_topology.h \ cpuid.h \ eflags.h \ - hpet.h \ io_map_entries.h \ + lapic.h \ lock.h \ locks.h \ machine_routines.h \ diff --git a/osfmk/i386/acpi.c b/osfmk/i386/acpi.c index 64c21447e..cd5bdbb71 100644 --- a/osfmk/i386/acpi.c +++ b/osfmk/i386/acpi.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -34,11 +34,12 @@ #include #include #include +#include #include #include #include -#include #include +#include #include #include @@ -147,8 +148,8 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) data.refcon = refcon; #endif - /* Save HPET state */ - hpet_save(); + /* Save power management timer state */ + pmTimerSave(); /* * Turn off VT, otherwise switching to legacy mode will fail @@ -212,6 +213,12 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) /* set up PAT following boot processor power up */ pat_init(); + /* + * Go through all of the CPUs and mark them as requiring + * a full restart. + */ + pmMarkAllCPUsOff(); + /* let the realtime clock reset */ rtc_sleep_wakeup(acpi_sleep_abstime); @@ -220,10 +227,13 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) /* re-enable and re-init local apic */ if (lapic_probe()) - lapic_init(); + lapic_configure(); + + /* Restore power management register state */ + pmCPUMarkRunning(current_cpu_datap()); - /* Restore HPET state */ - hpet_restore(); + /* Restore power management timer state */ + pmTimerRestore(); /* Restart tick interrupts from the LAPIC timer */ rtc_lapic_start_ticking(); diff --git a/osfmk/i386/apic.h b/osfmk/i386/apic.h index 6cce0663c..971e1d092 100644 --- a/osfmk/i386/apic.h +++ b/osfmk/i386/apic.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,112 +32,6 @@ #ifndef _I386_APIC_H_ #define _I386_APIC_H_ -#define LAPIC_START 0xFEE00000 -#define LAPIC_SIZE 0x00000400 - -#define LAPIC_ID 0x00000020 -#define LAPIC_ID_SHIFT 24 -#define LAPIC_ID_MASK 0x0F -#define LAPIC_VERSION 0x00000030 -#define LAPIC_VERSION_MASK 0xFF -#define LAPIC_TPR 0x00000080 -#define LAPIC_TPR_MASK 0xFF -#define LAPIC_APR 0x00000090 -#define LAPIC_APR_MASK 0xFF -#define LAPIC_PPR 0x000000A0 -#define LAPIC_PPR_MASK 0xFF -#define LAPIC_EOI 0x000000B0 -#define LAPIC_REMOTE_READ 0x000000C0 -#define LAPIC_LDR 0x000000D0 -#define LAPIC_LDR_SHIFT 24 -#define LAPIC_DFR 0x000000E0 -#define LAPIC_DFR_FLAT 0xFFFFFFFF -#define LAPIC_DFR_CLUSTER 0x0FFFFFFF -#define LAPIC_DFR_SHIFT 28 -#define LAPIC_SVR 0x000000F0 -#define LAPIC_SVR_MASK 0x0FF -#define LAPIC_SVR_ENABLE 0x100 -#define LAPIC_SVR_FOCUS_OFF 0x200 -#define LAPIC_ISR_BASE 0x00000100 -#define LAPIC_TMR_BASE 0x00000180 -#define LAPIC_IRR_BASE 0x00000200 -#define LAPIC_ERROR_STATUS 0x00000280 -#define LAPIC_ICR 0x00000300 -#define LAPIC_ICR_VECTOR_MASK 0x000FF -#define LAPIC_ICR_DM_MASK 0x00700 -#define LAPIC_ICR_DM_FIXED 0x00000 -#define LAPIC_ICR_DM_LOWEST 0x00100 -#define LAPIC_ICR_DM_SMI 0x00200 -#define LAPIC_ICR_DM_REMOTE 0x00300 -#define LAPIC_ICR_DM_NMI 0x00400 -#define LAPIC_ICR_DM_INIT 0x00500 -#define LAPIC_ICR_DM_STARTUP 0x00600 -#define LAPIC_ICR_DM_LOGICAL 0x00800 -#define LAPIC_ICR_DS_PENDING 0x01000 -#define LAPIC_ICR_LEVEL_ASSERT 0x04000 -#define LAPIC_ICR_TRIGGER_LEVEL 0x08000 -#define LAPIC_ICR_RR_MASK 0x30000 -#define LAPIC_ICR_RR_INVALID 0x00000 -#define LAPIC_ICR_RR_INPROGRESS 0x10000 -#define LAPIC_ICR_RR_VALID 0x20000 -#define LAPIC_ICR_DSS_MASK 0xC0000 -#define LAPIC_ICR_DSS_DEST 0x00000 -#define LAPIC_ICR_DSS_SELF 0x40000 -#define LAPIC_ICR_DSS_ALL 0x80000 -#define LAPIC_ICR_DSS_OTHERS 0xC0000 -#define LAPIC_ICRD 0x00000310 -#define LAPIC_ICRD_DEST_SHIFT 24 -#define LAPIC_LVT_TIMER 0x00000320 -#define 
LAPIC_LVT_THERMAL 0x00000330 -#define LAPIC_LVT_PERFCNT 0x00000340 -#define LAPIC_LVT_LINT0 0x00000350 -#define LAPIC_LVT_LINT1 0x00000360 -#define LAPIC_LVT_ERROR 0x00000370 -#define LAPIC_LVT_VECTOR_MASK 0x000FF -#define LAPIC_LVT_DM_SHIFT 8 -#define LAPIC_LVT_DM_MASK 0x00007 -#define LAPIC_LVT_DM_FIXED 0x00000 -#define LAPIC_LVT_DM_NMI 0x00400 -#define LAPIC_LVT_DM_EXTINT 0x00700 -#define LAPIC_LVT_DS_PENDING 0x01000 -#define LAPIC_LVT_IP_PLRITY_LOW 0x02000 -#define LAPIC_LVT_REMOTE_IRR 0x04000 -#define LAPIC_LVT_TM_LEVEL 0x08000 -#define LAPIC_LVT_MASKED 0x10000 -#define LAPIC_LVT_PERIODIC 0x20000 -#define LAPIC_TIMER_INITIAL_COUNT 0x00000380 -#define LAPIC_TIMER_CURRENT_COUNT 0x00000390 -#define LAPIC_TIMER_DIVIDE_CONFIG 0x000003E0 -/* divisor encoded by bits 0,1,3 with bit 2 always 0: */ -#define LAPIC_TIMER_DIVIDE_MASK 0x0000000F -#define LAPIC_TIMER_DIVIDE_2 0x00000000 -#define LAPIC_TIMER_DIVIDE_4 0x00000001 -#define LAPIC_TIMER_DIVIDE_8 0x00000002 -#define LAPIC_TIMER_DIVIDE_16 0x00000003 -#define LAPIC_TIMER_DIVIDE_32 0x00000008 -#define LAPIC_TIMER_DIVIDE_64 0x00000009 -#define LAPIC_TIMER_DIVIDE_128 0x0000000A -#define LAPIC_TIMER_DIVIDE_1 0x0000000B - -#ifndef ASSEMBLER -#include -typedef enum { - periodic, - one_shot -} lapic_timer_mode_t; -typedef enum { - divide_by_1 = LAPIC_TIMER_DIVIDE_1, - divide_by_2 = LAPIC_TIMER_DIVIDE_2, - divide_by_4 = LAPIC_TIMER_DIVIDE_4, - divide_by_8 = LAPIC_TIMER_DIVIDE_8, - divide_by_16 = LAPIC_TIMER_DIVIDE_16, - divide_by_32 = LAPIC_TIMER_DIVIDE_32, - divide_by_64 = LAPIC_TIMER_DIVIDE_64, - divide_by_128 = LAPIC_TIMER_DIVIDE_128 -} lapic_timer_divide_t; -typedef uint32_t lapic_timer_count_t; -#endif /* ASSEMBLER */ - #define IOAPIC_START 0xFEC00000 #define IOAPIC_SIZE 0x00000020 diff --git a/osfmk/i386/bsd_i386.c b/osfmk/i386/bsd_i386.c index 562b0b392..a870cc503 100644 --- a/osfmk/i386/bsd_i386.c +++ b/osfmk/i386/bsd_i386.c @@ -102,6 +102,7 @@ kern_return_t thread_compose_cthread_desc(unsigned int addr, pcb_t pcb); 
void IOSleep(int); +extern void throttle_lowpri_io(boolean_t); void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64); @@ -394,6 +395,8 @@ machdep_syscall(x86_saved_state_t *state) if (current_thread()->funnel_lock) (void) thread_funnel_set(current_thread()->funnel_lock, FALSE); + throttle_lowpri_io(TRUE); + thread_exception_return(); /* NOTREACHED */ } @@ -432,6 +435,8 @@ machdep_syscall64(x86_saved_state_t *state) if (current_thread()->funnel_lock) (void) thread_funnel_set(current_thread()->funnel_lock, FALSE); + throttle_lowpri_io(TRUE); + thread_exception_return(); /* NOTREACHED */ } @@ -712,6 +717,8 @@ mach_call_munger(x86_saved_state_t *state) retval, 0, 0, 0, 0); regs->eax = retval; + throttle_lowpri_io(TRUE); + thread_exception_return(); /* NOTREACHED */ } @@ -767,6 +774,8 @@ mach_call_munger64(x86_saved_state_t *state) (call_number)) | DBG_FUNC_END, (int)regs->rax, 0, 0, 0, 0); + throttle_lowpri_io(TRUE); + thread_exception_return(); /* NOTREACHED */ } diff --git a/osfmk/i386/commpage/commpage_mach_absolute_time.s b/osfmk/i386/commpage/commpage_mach_absolute_time.s index 3ea04c5f1..60baed63c 100644 --- a/osfmk/i386/commpage/commpage_mach_absolute_time.s +++ b/osfmk/i386/commpage/commpage_mach_absolute_time.s @@ -57,6 +57,8 @@ Lnanotime: jz 0b rdtsc /* get TSC in %edx:%eax */ + lfence + subl _COMM_PAGE_NT_TSC_BASE,%eax sbbl _COMM_PAGE_NT_TSC_BASE+4,%edx @@ -160,6 +162,7 @@ Lnanotime_64: // NB: must preserve r9, r10, and r11 testl %r8d,%r8d // if 0, data is being changed... jz 1b // ...so loop until stable rdtsc // edx:eax := tsc + lfence shlq $32,%rdx // rax := ((edx << 32) | eax), ie 64-bit tsc orq %rdx,%rax subq _NT_TSC_BASE(%rsi), %rax // rax := (tsc - base_tsc) diff --git a/osfmk/i386/cpu.c b/osfmk/i386/cpu.c index b0f87d7da..1760eabf5 100644 --- a/osfmk/i386/cpu.c +++ b/osfmk/i386/cpu.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -113,14 +113,26 @@ cpu_start( if (cpu == cpu_number()) { cpu_machine_init(); return KERN_SUCCESS; - } else { + } + + /* + * Try to bring the CPU back online without a reset. + * If the fast restart doesn't succeed, fall back to + * the slow way. + */ + ret = intel_startCPU_fast(cpu); + if (ret != KERN_SUCCESS) { /* * Should call out through PE. * But take the shortcut here. */ ret = intel_startCPU(cpu); - return(ret); } + + if (ret != KERN_SUCCESS) + kprintf("cpu: cpu_start(%d) returning failure!\n", cpu); + + return(ret); } void @@ -130,7 +142,8 @@ cpu_exit_wait( cpu_data_t *cdp = cpu_datap(cpu); simple_lock(&x86_topo_lock); - while (!cdp->lcpu.halted) { + while ((cdp->lcpu.state != LCPU_HALT) + && (cdp->lcpu.state != LCPU_OFF)) { simple_unlock(&x86_topo_lock); cpu_pause(); simple_lock(&x86_topo_lock); diff --git a/osfmk/i386/cpu_data.h b/osfmk/i386/cpu_data.h index e061888d7..ae05b1767 100644 --- a/osfmk/i386/cpu_data.h +++ b/osfmk/i386/cpu_data.h @@ -66,20 +66,6 @@ typedef struct rtclock_timer { boolean_t has_expired; } rtclock_timer_t; -typedef struct rtc_nanotime { - uint64_t tsc_base; /* timestamp */ - uint64_t ns_base; /* nanoseconds */ - uint32_t scale; /* tsc -> nanosec multiplier */ - uint32_t shift; /* tsc -> nanosec shift/div */ - /* shift is overloaded with - * lower 32bits of tsc_freq - * on slower machines (SLOW_TSC_THRESHOLD) */ - uint32_t generation; /* 0 == being updated */ - uint32_t spare1; -} rtc_nanotime_t; - -#define SLOW_TSC_THRESHOLD 1000067800 /* TSC is too slow for regular nanotime() algorithm */ - typedef struct { struct i386_tss *cdi_ktss; @@ -181,7 +167,7 @@ typedef struct cpu_data uint64_t *cpu_physwindow_ptep; void *cpu_hi_iss; boolean_t cpu_tlb_invalid; - uint32_t cpu_hwIntCnt[256]; /* Interrupt counts */ + uint32_t cpu_hwIntCnt[256]; /* Interrupt counts */ uint64_t cpu_dr7; /* debug control register */ uint64_t cpu_int_event_time; /* intr entry/exit time */ vmx_cpu_t cpu_vmx; /* 
wonderful world of virtualization */ @@ -195,7 +181,7 @@ typedef struct cpu_data * arg store * validity flag. */ - + rtc_nanotime_t *cpu_nanotime; /* Nanotime info */ } cpu_data_t; diff --git a/osfmk/i386/cpu_threads.c b/osfmk/i386/cpu_threads.c index b7f108ecf..7727eb7ea 100644 --- a/osfmk/i386/cpu_threads.c +++ b/osfmk/i386/cpu_threads.c @@ -35,27 +35,41 @@ #include #include +//#define TOPO_DEBUG 1 +#if TOPO_DEBUG +void debug_topology_print(void); +#define DBG(x...) kprintf("DBG: " x) +#else +#define DBG(x...) +#endif /* TOPO_DEBUG */ + #define bitmask(h,l) ((bit(h)|(bit(h)-1)) & ~(bit(l)-1)) #define bitfield(x,h,l) (((x) & bitmask(h,l)) >> l) -/* - * Kernel parameter determining whether threads are halted unconditionally - * in the idle state. This is the default behavior. - * See machine_idle() for use. - */ -int idlehalt = 1; - -x86_pkg_t *x86_pkgs = NULL; -uint32_t num_packages = 0; +x86_pkg_t *x86_pkgs = NULL; uint32_t num_Lx_caches[MAX_CACHE_DEPTH] = { 0 }; static x86_pkg_t *free_pkgs = NULL; +static x86_die_t *free_dies = NULL; static x86_core_t *free_cores = NULL; +static uint32_t num_dies = 0; static x86_cpu_cache_t *x86_caches = NULL; static uint32_t num_caches = 0; +static boolean_t topoParmsInited = FALSE; +x86_topology_parameters_t topoParms; + decl_simple_lock_data(, x86_topo_lock); + +static boolean_t +cpu_is_hyperthreaded(void) +{ + i386_cpu_info_t *cpuinfo; + + cpuinfo = cpuid_info(); + return(cpuinfo->thread_count > cpuinfo->core_count); +} static x86_cpu_cache_t * x86_cache_alloc(void) @@ -84,6 +98,167 @@ x86_cache_alloc(void) return(cache); } + +static void +x86_LLC_info(void) +{ + uint32_t index; + uint32_t cache_info[4]; + uint32_t cache_level = 0; + uint32_t nCPUsSharing = 1; + i386_cpu_info_t *cpuinfo; + + cpuinfo = cpuid_info(); + + do_cpuid(0, cache_info); + + if (cache_info[eax] < 4) { + /* + * Processor does not support deterministic + * cache information. Set LLC sharing to 1, since + * we have no better information. 
+ */ + if (cpu_is_hyperthreaded()) { + topoParms.nCoresSharingLLC = 1; + topoParms.nLCPUsSharingLLC = 2; + topoParms.maxSharingLLC = 2; + } else { + topoParms.nCoresSharingLLC = 1; + topoParms.nLCPUsSharingLLC = 1; + topoParms.maxSharingLLC = 1; + } + return; + } + + for (index = 0; ; index += 1) { + uint32_t this_level; + + cache_info[eax] = 4; + cache_info[ecx] = index; + cache_info[ebx] = 0; + cache_info[edx] = 0; + + cpuid(cache_info); + + /* + * See if all levels have been queried. + */ + if (bitfield(cache_info[eax], 4, 0) == 0) + break; + + /* + * Get the current level. + */ + this_level = bitfield(cache_info[eax], 7, 5); + + /* + * Only worry about it if it's a deeper level than + * what we've seen before. + */ + if (this_level > cache_level) { + cache_level = this_level; + + /* + * Save the number of CPUs sharing this cache. + */ + nCPUsSharing = bitfield(cache_info[eax], 25, 14) + 1; + } + } + + /* + * Make the level of the LLC be 0 based. + */ + topoParms.LLCDepth = cache_level - 1; + + /* + * nCPUsSharing represents the *maximum* number of cores or + * logical CPUs sharing the cache. + */ + topoParms.maxSharingLLC = nCPUsSharing; + + topoParms.nCoresSharingLLC = nCPUsSharing; + topoParms.nLCPUsSharingLLC = nCPUsSharing; + + /* + * nCPUsSharing may not be the number of *active* cores or + * threads that are sharing the cache. + */ + if (nCPUsSharing > cpuinfo->core_count) + topoParms.nCoresSharingLLC = cpuinfo->core_count; + if (nCPUsSharing > cpuinfo->thread_count) + topoParms.nLCPUsSharingLLC = cpuinfo->thread_count; + + + if (nCPUsSharing > cpuinfo->thread_count) + topoParms.maxSharingLLC = cpuinfo->thread_count; +} + +static void +initTopoParms(void) +{ + i386_cpu_info_t *cpuinfo; + + cpuinfo = cpuid_info(); + + /* + * We need to start with getting the LLC information correct. + */ + x86_LLC_info(); + + /* + * Compute the number of threads (logical CPUs) per core. 
+ */ + topoParms.nLThreadsPerCore = cpuinfo->thread_count / cpuinfo->core_count; + topoParms.nPThreadsPerCore = cpuinfo->cpuid_logical_per_package / cpuinfo->cpuid_cores_per_package; + + /* + * Compute the number of dies per package. + */ + topoParms.nLDiesPerPackage = cpuinfo->core_count / topoParms.nCoresSharingLLC; + topoParms.nPDiesPerPackage = cpuinfo->cpuid_cores_per_package / (topoParms.maxSharingLLC / topoParms.nPThreadsPerCore); + + /* + * Compute the number of cores per die. + */ + topoParms.nLCoresPerDie = topoParms.nCoresSharingLLC; + topoParms.nPCoresPerDie = (topoParms.maxSharingLLC / topoParms.nPThreadsPerCore); + + /* + * Compute the number of threads per die. + */ + topoParms.nLThreadsPerDie = topoParms.nLThreadsPerCore * topoParms.nLCoresPerDie; + topoParms.nPThreadsPerDie = topoParms.nPThreadsPerCore * topoParms.nPCoresPerDie; + + /* + * Compute the number of cores per package. + */ + topoParms.nLCoresPerPackage = topoParms.nLCoresPerDie * topoParms.nLDiesPerPackage; + topoParms.nPCoresPerPackage = topoParms.nPCoresPerDie * topoParms.nPDiesPerPackage; + + /* + * Compute the number of threads per package. 
+ */ + topoParms.nLThreadsPerPackage = topoParms.nLThreadsPerCore * topoParms.nLCoresPerPackage; + topoParms.nPThreadsPerPackage = topoParms.nPThreadsPerCore * topoParms.nPCoresPerPackage; + + DBG("\nLogical Topology Parameters:\n"); + DBG("\tThreads per Core: %d\n", topoParms.nLThreadsPerCore); + DBG("\tCores per Die: %d\n", topoParms.nLCoresPerDie); + DBG("\tThreads per Die: %d\n", topoParms.nLThreadsPerDie); + DBG("\tDies per Package: %d\n", topoParms.nLDiesPerPackage); + DBG("\tCores per Package: %d\n", topoParms.nLCoresPerPackage); + DBG("\tThreads per Package: %d\n", topoParms.nLThreadsPerPackage); + + DBG("\nPhysical Topology Parameters:\n"); + DBG("\tThreads per Core: %d\n", topoParms.nPThreadsPerCore); + DBG("\tCores per Die: %d\n", topoParms.nPCoresPerDie); + DBG("\tThreads per Die: %d\n", topoParms.nPThreadsPerDie); + DBG("\tDies per Package: %d\n", topoParms.nPDiesPerPackage); + DBG("\tCores per Package: %d\n", topoParms.nPCoresPerPackage); + DBG("\tThreads per Package: %d\n", topoParms.nPThreadsPerPackage); + + topoParmsInited = TRUE; +} static void x86_cache_free(x86_cpu_cache_t *cache) @@ -141,7 +316,7 @@ x86_cache_list(void) cur->type = bitfield(cache_info[eax], 4, 0); cur->level = bitfield(cache_info[eax], 7, 5); - cur->nlcpus = bitfield(cache_info[eax], 25, 14) + 1; + cur->maxcpus = (bitfield(cache_info[eax], 25, 14) + 1); cur->line_size = bitfield(cache_info[ebx], 11, 0) + 1; cur->partitions = bitfield(cache_info[ebx], 21, 12) + 1; cur->ways = bitfield(cache_info[ebx], 31, 22) + 1; @@ -156,20 +331,33 @@ x86_cache_list(void) last = cur; } + cur->nlcpus = 0; num_Lx_caches[cur->level - 1] += 1; } return(root); } -static boolean_t -cpu_is_hyperthreaded(void) +static x86_cpu_cache_t * +x86_match_cache(x86_cpu_cache_t *list, x86_cpu_cache_t *matcher) { - if (cpuid_features() & CPUID_FEATURE_HTT) - return (cpuid_info()->cpuid_logical_per_package / - cpuid_info()->cpuid_cores_per_package) > 1; - else - return FALSE; + x86_cpu_cache_t *cur_cache; + + 
cur_cache = list; + while (cur_cache != NULL) { + if (cur_cache->maxcpus == matcher->maxcpus + && cur_cache->type == matcher->type + && cur_cache->level == matcher->level + && cur_cache->ways == matcher->ways + && cur_cache->partitions == matcher->partitions + && cur_cache->line_size == matcher->line_size + && cur_cache->cache_size == matcher->cache_size) + break; + + cur_cache = cur_cache->next; + } + + return(cur_cache); } static void @@ -184,17 +372,21 @@ x86_lcpu_init(int cpu) lcpu = &cpup->lcpu; lcpu->lcpu = lcpu; lcpu->cpu = cpup; - lcpu->next = NULL; - lcpu->core = NULL; + lcpu->next_in_core = NULL; + lcpu->next_in_die = NULL; + lcpu->next_in_pkg = NULL; + lcpu->core = NULL; + lcpu->die = NULL; + lcpu->package = NULL; + lcpu->cpu_num = cpu; lcpu->lnum = cpu; lcpu->pnum = cpup->cpu_phys_number; - lcpu->halted = FALSE; /* XXX is this correct? */ - lcpu->idle = FALSE; /* XXX is this correct? */ + lcpu->state = LCPU_OFF; for (i = 0; i < MAX_CACHE_DEPTH; i += 1) lcpu->caches[i] = NULL; - lcpu->master = (lcpu->pnum == (unsigned int) master_cpu); - lcpu->primary = (lcpu->pnum % cpuid_info()->cpuid_logical_per_package) == 0; + lcpu->master = (lcpu->cpu_num == (unsigned int) master_cpu); + lcpu->primary = (lcpu->pnum % topoParms.nPThreadsPerPackage) == 0; } static x86_core_t * @@ -202,16 +394,14 @@ x86_core_alloc(int cpu) { x86_core_t *core; cpu_data_t *cpup; - uint32_t cpu_in_pkg; - uint32_t lcpus_per_core; cpup = cpu_datap(cpu); simple_lock(&x86_topo_lock); if (free_cores != NULL) { core = free_cores; - free_cores = core->next; - core->next = NULL; + free_cores = core->next_in_die; + core->next_in_die = NULL; simple_unlock(&x86_topo_lock); } else { simple_unlock(&x86_topo_lock); @@ -222,12 +412,8 @@ x86_core_alloc(int cpu) bzero((void *) core, sizeof(x86_core_t)); - cpu_in_pkg = cpu % cpuid_info()->cpuid_logical_per_package; - lcpus_per_core = cpuid_info()->cpuid_logical_per_package / - cpuid_info()->cpuid_cores_per_package; - - core->pcore_num = 
cpup->cpu_phys_number / lcpus_per_core; - core->lcore_num = core->pcore_num % cpuid_info()->cpuid_cores_per_package; + core->pcore_num = cpup->cpu_phys_number / topoParms.nPThreadsPerCore; + core->lcore_num = core->pcore_num % topoParms.nPCoresPerPackage; core->flags = X86CORE_FL_PRESENT | X86CORE_FL_READY | X86CORE_FL_HALTED | X86CORE_FL_IDLE; @@ -239,7 +425,7 @@ static void x86_core_free(x86_core_t *core) { simple_lock(&x86_topo_lock); - core->next = free_cores; + core->next_in_die = free_cores; free_cores = core; simple_unlock(&x86_topo_lock); } @@ -253,7 +439,7 @@ x86_package_find(int cpu) cpup = cpu_datap(cpu); - pkg_num = cpup->cpu_phys_number / cpuid_info()->cpuid_logical_per_package; + pkg_num = cpup->cpu_phys_number / topoParms.nPThreadsPerPackage; pkg = x86_pkgs; while (pkg != NULL) { @@ -264,48 +450,203 @@ x86_package_find(int cpu) return(pkg); } + +static x86_die_t * +x86_die_find(int cpu) +{ + x86_die_t *die; + x86_pkg_t *pkg; + cpu_data_t *cpup; + uint32_t die_num; + + cpup = cpu_datap(cpu); + + die_num = cpup->cpu_phys_number / topoParms.nPThreadsPerDie; + + pkg = x86_package_find(cpu); + if (pkg == NULL) + return(NULL); + + die = pkg->dies; + while (die != NULL) { + if (die->pdie_num == die_num) + break; + die = die->next_in_pkg; + } + + return(die); +} static x86_core_t * x86_core_find(int cpu) { x86_core_t *core; - x86_pkg_t *pkg; + x86_die_t *die; cpu_data_t *cpup; uint32_t core_num; cpup = cpu_datap(cpu); - core_num = cpup->cpu_phys_number - / (cpuid_info()->cpuid_logical_per_package - / cpuid_info()->cpuid_cores_per_package); + core_num = cpup->cpu_phys_number / topoParms.nPThreadsPerCore; - pkg = x86_package_find(cpu); - if (pkg == NULL) + die = x86_die_find(cpu); + if (die == NULL) return(NULL); - core = pkg->cores; + core = die->cores; while (core != NULL) { if (core->pcore_num == core_num) break; - core = core->next; + core = core->next_in_die; } return(core); } + +void +x86_set_lcpu_numbers(x86_lcpu_t *lcpu) +{ + lcpu->lnum = lcpu->cpu_num 
% topoParms.nLThreadsPerCore; +} + +void +x86_set_core_numbers(x86_core_t *core, x86_lcpu_t *lcpu) +{ + core->pcore_num = lcpu->cpu_num / topoParms.nLThreadsPerCore; + core->lcore_num = core->pcore_num % topoParms.nLCoresPerDie; +} + +void +x86_set_die_numbers(x86_die_t *die, x86_lcpu_t *lcpu) +{ + die->pdie_num = lcpu->cpu_num / (topoParms.nLThreadsPerCore * topoParms.nLCoresPerDie); + die->ldie_num = die->pdie_num % topoParms.nLDiesPerPackage; +} + +void +x86_set_pkg_numbers(x86_pkg_t *pkg, x86_lcpu_t *lcpu) +{ + pkg->ppkg_num = lcpu->cpu_num / topoParms.nLThreadsPerPackage; + pkg->lpkg_num = pkg->ppkg_num; +} + +static x86_die_t * +x86_die_alloc(int cpu) +{ + x86_die_t *die; + cpu_data_t *cpup; + + cpup = cpu_datap(cpu); + + simple_lock(&x86_topo_lock); + if (free_dies != NULL) { + die = free_dies; + free_dies = die->next_in_pkg; + die->next_in_pkg = NULL; + simple_unlock(&x86_topo_lock); + } else { + simple_unlock(&x86_topo_lock); + die = kalloc(sizeof(x86_die_t)); + if (die == NULL) + panic("x86_die_alloc() kalloc of x86_die_t failed!\n"); + } + + bzero((void *) die, sizeof(x86_die_t)); + + die->pdie_num = cpup->cpu_phys_number / topoParms.nPThreadsPerDie; + + die->ldie_num = num_dies; + atomic_incl((long *) &num_dies, 1); + + die->flags = X86DIE_FL_PRESENT; + return(die); +} static void -x86_core_add_lcpu(x86_core_t *core, x86_lcpu_t *lcpu) +x86_die_free(x86_die_t *die) +{ + simple_lock(&x86_topo_lock); + die->next_in_pkg = free_dies; + free_dies = die; + atomic_decl((long *) &num_dies, 1); + simple_unlock(&x86_topo_lock); +} + +static x86_pkg_t * +x86_package_alloc(int cpu) +{ + x86_pkg_t *pkg; + cpu_data_t *cpup; + + cpup = cpu_datap(cpu); + + simple_lock(&x86_topo_lock); + if (free_pkgs != NULL) { + pkg = free_pkgs; + free_pkgs = pkg->next; + pkg->next = NULL; + simple_unlock(&x86_topo_lock); + } else { + simple_unlock(&x86_topo_lock); + pkg = kalloc(sizeof(x86_pkg_t)); + if (pkg == NULL) + panic("x86_package_alloc() kalloc of x86_pkg_t failed!\n"); + } + 
+ bzero((void *) pkg, sizeof(x86_pkg_t)); + + pkg->ppkg_num = cpup->cpu_phys_number / topoParms.nPThreadsPerPackage; + + pkg->lpkg_num = topoParms.nPackages; + atomic_incl((long *) &topoParms.nPackages, 1); + + pkg->flags = X86PKG_FL_PRESENT | X86PKG_FL_READY; + return(pkg); +} + +static void +x86_package_free(x86_pkg_t *pkg) +{ + simple_lock(&x86_topo_lock); + pkg->next = free_pkgs; + free_pkgs = pkg; + atomic_decl((long *) &topoParms.nPackages, 1); + simple_unlock(&x86_topo_lock); +} + +static void +x86_cache_add_lcpu(x86_cpu_cache_t *cache, x86_lcpu_t *lcpu) +{ + x86_cpu_cache_t *cur_cache; + int i; + + /* + * Put the new CPU into the list of the cache. + */ + cur_cache = lcpu->caches[cache->level - 1]; + lcpu->caches[cache->level - 1] = cache; + cache->next = cur_cache; + cache->nlcpus += 1; + for (i = 0; i < cache->nlcpus; i += 1) { + if (cache->cpus[i] == NULL) { + cache->cpus[i] = lcpu; + break; + } + } +} + +static void +x86_lcpu_add_caches(x86_lcpu_t *lcpu) { x86_cpu_cache_t *list; x86_cpu_cache_t *cur; - x86_core_t *cur_core; + x86_cpu_cache_t *match; + x86_die_t *die; + x86_core_t *core; x86_lcpu_t *cur_lcpu; - boolean_t found; - int level; - int i; - uint32_t cpu_mask; + uint32_t level; + boolean_t found = FALSE; - assert(core != NULL); assert(lcpu != NULL); /* @@ -328,8 +669,9 @@ x86_core_add_lcpu(x86_core_t *core, x86_lcpu_t *lcpu) * If the cache isn't shared then just put it where it * belongs. */ - if (cur->nlcpus == 1) { - goto found_first; + if (cur->maxcpus == 1) { + x86_cache_add_lcpu(cur, lcpu); + continue; } /* @@ -345,101 +687,131 @@ x86_core_add_lcpu(x86_core_t *core, x86_lcpu_t *lcpu) /* * This is a shared cache, so we have to figure out if * this is the first time we've seen this cache. We do - * this by searching through the package and seeing if - * a related core is already describing this cache. + * this by searching through the topology and seeing if + * this cache is already described. 
* - * NOTE: This assumes that CPUs whose ID mod <# sharing cache> - * are indeed sharing the cache. + * Assume that L{LLC-1} are all at the core level and that + * LLC is shared at the die level. */ - cpu_mask = lcpu->pnum & ~(cur->nlcpus - 1); - cur_core = core->package->cores; - found = FALSE; - - while (cur_core != NULL && !found) { - cur_lcpu = cur_core->lcpus; - while (cur_lcpu != NULL && !found) { - if ((cur_lcpu->pnum & ~(cur->nlcpus - 1)) == cpu_mask) { - lcpu->caches[level] = cur_lcpu->caches[level]; - found = TRUE; - x86_cache_free(cur); + if (level < topoParms.LLCDepth) { + /* + * Shared at the core. + */ + core = lcpu->core; + cur_lcpu = core->lcpus; + while (cur_lcpu != NULL) { + /* + * Skip ourselves. + */ + if (cur_lcpu == lcpu) { + cur_lcpu = cur_lcpu->next_in_core; + continue; + } - /* - * Put the new CPU into the list of the cache. - */ - cur = lcpu->caches[level]; - for (i = 0; i < cur->nlcpus; i += 1) { - if (cur->cpus[i] == NULL) { - cur->cpus[i] = lcpu; - break; - } - } + /* + * If there's a cache on this logical CPU, + * then use that one. + */ + match = x86_match_cache(cur_lcpu->caches[level], cur); + if (match != NULL) { + x86_cache_free(cur); + x86_cache_add_lcpu(match, lcpu); + found = TRUE; + break; } - cur_lcpu = cur_lcpu->next; + + cur_lcpu = cur_lcpu->next_in_core; } + } else { + /* + * Shared at the die. + */ + die = lcpu->die; + cur_lcpu = die->lcpus; + while (cur_lcpu != NULL) { + /* + * Skip ourselves. + */ + if (cur_lcpu == lcpu) { + cur_lcpu = cur_lcpu->next_in_die; + continue; + } - cur_core = cur_core->next; + /* + * If there's a cache on this logical CPU, + * then use that one. + */ + match = x86_match_cache(cur_lcpu->caches[level], cur); + if (match != NULL) { + x86_cache_free(cur); + x86_cache_add_lcpu(match, lcpu); + found = TRUE; + break; + } + + cur_lcpu = cur_lcpu->next_in_die; + } } + /* + * If a shared cache wasn't found, then this logical CPU must + * be the first one encountered. 
+ */ if (!found) { -found_first: - cur->next = lcpu->caches[level]; - lcpu->caches[level] = cur; - cur->cpus[0] = lcpu; + x86_cache_add_lcpu(cur, lcpu); } } - /* - * Add the Logical CPU to the core. - */ - lcpu->next = core->lcpus; - lcpu->core = core; - core->lcpus = lcpu; - core->num_lcpus += 1; - simple_unlock(&x86_topo_lock); } -static x86_pkg_t * -x86_package_alloc(int cpu) +static void +x86_core_add_lcpu(x86_core_t *core, x86_lcpu_t *lcpu) { - x86_pkg_t *pkg; - cpu_data_t *cpup; - - cpup = cpu_datap(cpu); + assert(core != NULL); + assert(lcpu != NULL); simple_lock(&x86_topo_lock); - if (free_pkgs != NULL) { - pkg = free_pkgs; - free_pkgs = pkg->next; - pkg->next = NULL; - simple_unlock(&x86_topo_lock); - } else { - simple_unlock(&x86_topo_lock); - pkg = kalloc(sizeof(x86_pkg_t)); - if (pkg == NULL) - panic("x86_package_alloc() kalloc of x86_pkg_t failed!\n"); - } - bzero((void *) pkg, sizeof(x86_pkg_t)); + lcpu->next_in_core = core->lcpus; + lcpu->core = core; + core->lcpus = lcpu; + core->num_lcpus += 1; + simple_unlock(&x86_topo_lock); +} - pkg->ppkg_num = cpup->cpu_phys_number - / cpuid_info()->cpuid_logical_per_package; +static void +x86_die_add_lcpu(x86_die_t *die, x86_lcpu_t *lcpu) +{ + assert(die != NULL); + assert(lcpu != NULL); + + lcpu->next_in_die = die->lcpus; + lcpu->die = die; + die->lcpus = lcpu; +} - pkg->lpkg_num = num_packages; - atomic_incl((long *) &num_packages, 1); +static void +x86_die_add_core(x86_die_t *die, x86_core_t *core) +{ + assert(die != NULL); + assert(core != NULL); - pkg->flags = X86PKG_FL_PRESENT | X86PKG_FL_READY; - return(pkg); + core->next_in_die = die->cores; + core->die = die; + die->cores = core; + die->num_cores += 1; } -static void -x86_package_free(x86_pkg_t *pkg) + static void +x86_package_add_lcpu(x86_pkg_t *pkg, x86_lcpu_t *lcpu) { - simple_lock(&x86_topo_lock); - pkg->next = free_pkgs; - free_pkgs = pkg; - atomic_decl((long *) &num_packages, 1); - simple_unlock(&x86_topo_lock); + assert(pkg != NULL); + 
assert(lcpu != NULL); + + lcpu->next_in_pkg = pkg->lcpus; + lcpu->package = pkg; + pkg->lcpus = lcpu; } static void @@ -448,26 +820,56 @@ x86_package_add_core(x86_pkg_t *pkg, x86_core_t *core) assert(pkg != NULL); assert(core != NULL); - core->next = pkg->cores; + core->next_in_pkg = pkg->cores; core->package = pkg; pkg->cores = core; - pkg->num_cores += 1; +} + +static void +x86_package_add_die(x86_pkg_t *pkg, x86_die_t *die) +{ + assert(pkg != NULL); + assert(die != NULL); + + die->next_in_pkg = pkg->dies; + die->package = pkg; + pkg->dies = die; + pkg->num_dies += 1; } void * cpu_thread_alloc(int cpu) { - x86_core_t *core; - x86_pkg_t *pkg; + x86_core_t *core = NULL; + x86_die_t *die = NULL; + x86_pkg_t *pkg = NULL; cpu_data_t *cpup; uint32_t phys_cpu; + /* + * Only allow one to manipulate the topology at a time. + */ + simple_lock(&x86_topo_lock); + + /* + * Make sure all of the topology parameters have been initialized. + */ + if (!topoParmsInited) + initTopoParms(); + cpup = cpu_datap(cpu); phys_cpu = cpup->cpu_phys_number; x86_lcpu_init(cpu); + /* + * Allocate performance counter structure. + */ + simple_unlock(&x86_topo_lock); + cpup->lcpu.pmc = pmc_alloc(); + simple_lock(&x86_topo_lock); + /* * Assume that all cpus have the same features. */ @@ -478,22 +880,9 @@ cpu_thread_alloc(int cpu) } /* - * Only allow one to manipulate the topology at a time. - */ - simple_lock(&x86_topo_lock); - - /* - * Get the core for this logical CPU. + * Get the package that the logical CPU is in. */ - core_again: - core = x86_core_find(cpu); - if (core == NULL) { - /* - * Core structure hasn't been created yet, do it now. - * - * Get the package that the core is part of. 
- */ - package_again: + do { pkg = x86_package_find(cpu); if (pkg == NULL) { /* @@ -504,7 +893,7 @@ cpu_thread_alloc(int cpu) simple_lock(&x86_topo_lock); if (x86_package_find(cpu) != NULL) { x86_package_free(pkg); - goto package_again; + continue; } /* @@ -513,31 +902,58 @@ cpu_thread_alloc(int cpu) pkg->next = x86_pkgs; x86_pkgs = pkg; } + } while (pkg == NULL); - /* - * Allocate the core structure now. - */ - simple_unlock(&x86_topo_lock); - core = x86_core_alloc(cpu); - simple_lock(&x86_topo_lock); - if (x86_core_find(cpu) != NULL) { - x86_core_free(core); - goto core_again; + /* + * Get the die that the logical CPU is in. + */ + do { + die = x86_die_find(cpu); + if (die == NULL) { + /* + * Die structure hasn't been created yet, do it now. + */ + simple_unlock(&x86_topo_lock); + die = x86_die_alloc(cpu); + simple_lock(&x86_topo_lock); + if (x86_die_find(cpu) != NULL) { + x86_die_free(die); + continue; + } + + /* + * Add the die to the package. + */ + x86_package_add_die(pkg, die); } + } while (die == NULL); - /* - * Add it to the package. - */ - x86_package_add_core(pkg, core); - machine_info.physical_cpu_max += 1; + /* + * Get the core for this logical CPU. + */ + do { + core = x86_core_find(cpu); + if (core == NULL) { + /* + * Allocate the core structure now. + */ + simple_unlock(&x86_topo_lock); + core = x86_core_alloc(cpu); + simple_lock(&x86_topo_lock); + if (x86_core_find(cpu) != NULL) { + x86_core_free(core); + continue; + } + + /* + * Add the core to the die & package. + */ + x86_die_add_core(die, core); + x86_package_add_core(pkg, core); + machine_info.physical_cpu_max += 1; + } + } while (core == NULL); - /* - * Allocate performance counter structure. - */ - simple_unlock(&x86_topo_lock); - core->pmc = pmc_alloc(); - simple_lock(&x86_topo_lock); - } /* * Done manipulating the topology, so others can get in. 
@@ -545,7 +961,13 @@ cpu_thread_alloc(int cpu) machine_info.logical_cpu_max += 1; simple_unlock(&x86_topo_lock); + /* + * Add the logical CPU to the other topology structures. + */ x86_core_add_lcpu(core, &cpup->lcpu); + x86_die_add_lcpu(core->die, &cpup->lcpu); + x86_package_add_lcpu(core->package, &cpup->lcpu); + x86_lcpu_add_caches(&cpup->lcpu); return (void *) core; } @@ -553,10 +975,10 @@ cpu_thread_alloc(int cpu) void cpu_thread_init(void) { - int my_cpu = get_cpu_number(); - cpu_data_t *cpup = current_cpu_datap(); + int my_cpu = get_cpu_number(); + cpu_data_t *cpup = current_cpu_datap(); x86_core_t *core; - static int initialized = 0; + static int initialized = 0; /* * If we're the boot processor, we do all of the initialization of @@ -582,8 +1004,6 @@ cpu_thread_init(void) if (core->active_lcpus == 0) machine_info.physical_cpu += 1; core->active_lcpus += 1; - cpup->lcpu.halted = FALSE; - cpup->lcpu.idle = FALSE; simple_unlock(&x86_topo_lock); pmCPUMarkRunning(cpup); @@ -602,7 +1022,6 @@ cpu_thread_halt(void) simple_lock(&x86_topo_lock); machine_info.logical_cpu -= 1; - cpup->lcpu.idle = TRUE; core = cpup->lcpu.core; core->active_lcpus -= 1; if (core->active_lcpus == 0) @@ -619,3 +1038,62 @@ cpu_thread_halt(void) } /* NOT REACHED */ } + +#if TOPO_DEBUG +/* + * Prints out the topology + */ +void +debug_topology_print(void) +{ + x86_pkg_t *pkg; + x86_die_t *die; + x86_core_t *core; + x86_lcpu_t *cpu; + + pkg = x86_pkgs; + while (pkg != NULL) { + kprintf("Package:\n"); + kprintf(" Physical: %d\n", pkg->ppkg_num); + kprintf(" Logical: %d\n", pkg->lpkg_num); + + die = pkg->dies; + while (die != NULL) { + kprintf(" Die:\n"); + kprintf(" Physical: %d\n", die->pdie_num); + kprintf(" Logical: %d\n", die->ldie_num); + + core = die->cores; + while (core != NULL) { + kprintf(" Core:\n"); + kprintf(" Physical: %d\n", core->pcore_num); + kprintf(" Logical: %d\n", core->lcore_num); + + cpu = core->lcpus; + while (cpu != NULL) { + kprintf(" LCPU:\n"); + kprintf(" CPU #: 
%d\n", cpu->cpu_num); + kprintf(" Physical: %d\n", cpu->pnum); + kprintf(" Logical: %d\n", cpu->lnum); + kprintf(" Flags: "); + if (cpu->master) + kprintf("MASTER "); + if (cpu->primary) + kprintf("PRIMARY"); + if (!cpu->master && !cpu->primary) + kprintf("(NONE)"); + kprintf("\n"); + + cpu = cpu->next_in_core; + } + + core = core->next_in_die; + } + + die = die->next_in_pkg; + } + + pkg = pkg->next; + } +} +#endif /* TOPO_DEBUG */ diff --git a/osfmk/i386/cpu_threads.h b/osfmk/i386/cpu_threads.h index 8208cc7ca..dca8b4016 100644 --- a/osfmk/i386/cpu_threads.h +++ b/osfmk/i386/cpu_threads.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2007 Apple Inc. All rights reserved. + * Copyright (c) 2003-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -48,14 +48,17 @@ #define cpu_to_lcpu(cpu) ((cpu_datap(cpu) != NULL) ? _cpu_to_lcpu(cpu) : NULL) #define cpu_to_core(cpu) ((cpu_to_lcpu(cpu) != NULL) ? _cpu_to_lcpu(cpu)->core : NULL) -#define cpu_to_package(cpu) ((cpu_to_core(cpu) != NULL) ? _cpu_to_core(cpu)->package : NULL) +#define cpu_to_die(cpu) ((cpu_to_lcpu(cpu) != NULL) ? _cpu_to_lcpu(cpu)->die : NULL) +#define cpu_to_package(cpu) ((cpu_to_lcpu(cpu) != NULL) ? 
_cpu_to_lcpu(cpu)->package : NULL) /* Fast access: */ #define x86_lcpu() (&current_cpu_datap()->lcpu) #define x86_core() (x86_lcpu()->core) -#define x86_package() (x86_core()->package) +#define x86_die() (x86_lcpu()->die) +#define x86_package() (x86_lcpu()->package) #define cpu_is_same_core(cpu1,cpu2) (cpu_to_core(cpu1) == cpu_to_core(cpu2)) +#define cpu_is_same_die(cpu1,cpu2) (cpu_to_die(cpu1) == cpu_to_die(cpu2)) #define cpu_is_same_package(cpu1,cpu2) (cpu_to_package(cpu1) == cpu_to_package(cpu2)) #define cpus_share_cache(cpu1,cpu2,_cl) (cpu_to_lcpu(cpu1)->caches[_cl] == cpu_to_lcpu(cpu2)->caches[_cl]) @@ -65,4 +68,11 @@ extern void *cpu_thread_alloc(int); extern void cpu_thread_init(void); extern void cpu_thread_halt(void); +extern void x86_set_lcpu_numbers(x86_lcpu_t *lcpu); +extern void x86_set_core_numbers(x86_core_t *core, x86_lcpu_t *lcpu); +extern void x86_set_die_numbers(x86_die_t *die, x86_lcpu_t *lcpu); +extern void x86_set_pkg_numbers(x86_pkg_t *pkg, x86_lcpu_t *lcpu); + +extern x86_topology_parameters_t topoParms; + #endif /* _I386_CPU_THREADS_H_ */ diff --git a/osfmk/i386/cpu_topology.c b/osfmk/i386/cpu_topology.c index 56b3b43a9..6e823c980 100644 --- a/osfmk/i386/cpu_topology.c +++ b/osfmk/i386/cpu_topology.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include //#define TOPO_DEBUG 1 #if TOPO_DEBUG @@ -44,6 +44,7 @@ #else #define DBG(x...) 
#endif +void debug_topology_print(void); __private_extern__ void qsort( void * array, @@ -116,6 +117,13 @@ cpu_topology_start(void) */ for (i = 1; i < ncpus; i++) { cpu_data_t *cpup = cpu_datap(i); + x86_core_t *core = cpup->lcpu.core; + x86_die_t *die = cpup->lcpu.die; + x86_pkg_t *pkg = cpup->lcpu.package; + + assert(core != NULL); + assert(die != NULL); + assert(pkg != NULL); if (cpup->cpu_number != i) { kprintf("cpu_datap(%d):0x%08x local apic id 0x%x " @@ -124,27 +132,37 @@ cpu_topology_start(void) cpup->cpu_number); } cpup->cpu_number = i; - cpup->lcpu.lnum = i; + cpup->lcpu.cpu_num = i; + cpup->lcpu.pnum = cpup->cpu_phys_number; lapic_cpu_map(cpup->cpu_phys_number, i); + x86_set_lcpu_numbers(&cpup->lcpu); + x86_set_core_numbers(core, &cpup->lcpu); + x86_set_die_numbers(die, &cpup->lcpu); + x86_set_pkg_numbers(pkg, &cpup->lcpu); } +#if TOPO_DEBUG + debug_topology_print(); +#endif /* TOPO_DEBUG */ + ml_set_interrupts_enabled(istate); + DBG("cpu_topology_start() LLC is L%d\n", topoParms.LLCDepth + 1); /* * Iterate over all logical cpus finding or creating the affinity set - * for their L2 cache. Each affinity set possesses a processor set + * for their LLC cache. Each affinity set possesses a processor set * into which each logical processor is added. 
*/ DBG("cpu_topology_start() creating affinity sets:\n"); for (i = 0; i < ncpus; i++) { cpu_data_t *cpup = cpu_datap(i); x86_lcpu_t *lcpup = cpu_to_lcpu(i); - x86_cpu_cache_t *L2_cachep; + x86_cpu_cache_t *LLC_cachep; x86_affinity_set_t *aset; - L2_cachep = lcpup->caches[CPU_CACHE_DEPTH_L2]; - assert(L2_cachep->type == CPU_CACHE_TYPE_UNIF); - aset = find_cache_affinity(L2_cachep); + LLC_cachep = lcpup->caches[topoParms.LLCDepth]; + assert(LLC_cachep->type == CPU_CACHE_TYPE_UNIF); + aset = find_cache_affinity(LLC_cachep); if (aset == NULL) { aset = (x86_affinity_set_t *) kalloc(sizeof(*aset)); if (aset == NULL) @@ -152,7 +170,7 @@ cpu_topology_start(void) aset->next = x86_affinities; x86_affinities = aset; aset->num = x86_affinity_count++; - aset->cache = L2_cachep; + aset->cache = LLC_cachep; aset->pset = (i == master_cpu) ? processor_pset(master_processor) : pset_create(pset_node_root()); @@ -163,7 +181,7 @@ cpu_topology_start(void) } DBG("\tprocessor_init set %p(%d) lcpup %p(%d) cpu %p processor %p\n", - aset, aset->num, lcpup, lcpup->lnum, cpup, cpup->cpu_processor); + aset, aset->num, lcpup, lcpup->cpu_num, cpup, cpup->cpu_processor); if (i != master_cpu) processor_init(cpup->cpu_processor, i, aset->pset); @@ -222,8 +240,7 @@ ml_affinity_to_pset(uint32_t affinity_num) if (affinity_num == aset->num) break; } - return (aset == NULL) ? PROCESSOR_SET_NULL : aset->pset; - + return (aset == NULL) ? PROCESSOR_SET_NULL : aset->pset; } uint64_t @@ -233,7 +250,7 @@ ml_cpu_cache_size(unsigned int level) if (level == 0) { return machine_info.max_mem; - } else if ( 1 <= level && level <= 3) { + } else if ( 1 <= level && level <= MAX_CACHE_DEPTH) { cachep = current_cpu_datap()->lcpu.caches[level-1]; return cachep ? 
cachep->cache_size : 0; } else { @@ -248,7 +265,7 @@ ml_cpu_cache_sharing(unsigned int level) if (level == 0) { return machine_info.max_cpus; - } else if ( 1 <= level && level <= 3) { + } else if ( 1 <= level && level <= MAX_CACHE_DEPTH) { cachep = current_cpu_datap()->lcpu.caches[level-1]; return cachep ? cachep->nlcpus : 0; } else { diff --git a/osfmk/i386/cpu_topology.h b/osfmk/i386/cpu_topology.h index 86729b935..f5cbbefb4 100644 --- a/osfmk/i386/cpu_topology.h +++ b/osfmk/i386/cpu_topology.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2007 Apple Inc. All rights reserved. + * Copyright (c) 2003-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -45,6 +45,7 @@ typedef struct x86_cpu_cache { struct x86_cpu_cache *next; /* next cache at this level/lcpu */ + struct x86_die *die; /* die containing this cache (only for LLC) */ uint8_t maxcpus; /* maximum # of cpus that can share */ uint8_t nlcpus; /* # of logical cpus sharing this cache */ uint8_t type; /* type of cache */ @@ -68,22 +69,83 @@ typedef struct x86_cpu_cache struct pmc; struct cpu_data; +struct mca_state; +/* + * Define the states that a (logical) CPU can be in. + * + * LCPU_OFF This indicates that the CPU is "off". It requires a full + * restart. This is the state of a CPU when the system first + * boots or when it comes out of "sleep" (aka S3/S5). + * + * LCPU_HALT This indicates that the CPU has been "halted". It has been + * removed from the system but still retains its internal state + * so that it can be quickly brought back on-line. + * + * LCPU_NONSCHED This indicates that the CPU is not schedulable. It + * will still appear in the system as a viable CPU however no + * work will be scheduled on it. + * + * LCPU_PAUSE This indicates that the CPU is "paused". This is usually + * done only during kernel debug. + * + * LCPU_IDLE This indicates that the CPU is idle. The scheduler has + * determined that there is no work for this CPU to do. 
+ * + * LCPU_RUN This indicates that the CPU is running code and performing work. + * + * In normal system operation, CPUs will usually be transitioning between + * LCPU_IDLE and LCPU_RUN. + */ +typedef enum lcpu_state +{ + LCPU_OFF = 0, /* 0 so the right thing happens on boot */ + LCPU_HALT = 1, + LCPU_NONSCHED = 2, + LCPU_PAUSE = 3, + LCPU_IDLE = 4, + LCPU_RUN = 5, +} lcpu_state_t; + +/* + * In each topology structure there are two numbers: a logical number and a + * physical number. + * + * The logical numbers represent the ID of that structure + * relative to the enclosing structure and always starts at 0. So when using + * logical numbers, it is necessary to specify all elements in the topology + * (ie to "name" a logical CPU using logical numbers, 4 numbers are required: + * package, die, core, logical CPU). + * + * The physical numbers represent the ID of that structure and is unique (for + * that structure) across the entire topology. + * + * The logical CPU structure contains a third number which is the CPU number. + * This number is identical to the CPU number used in other parts of the kernel. 
+ */ typedef struct x86_lcpu { - struct x86_lcpu *next; /* next logical cpu in core */ - struct x86_lcpu *lcpu; /* pointer back to self */ - struct x86_core *core; /* core containing the logical cpu */ - struct cpu_data *cpu; /* cpu_data structure */ - uint32_t lnum; /* logical cpu number */ - uint32_t pnum; /* physical cpu number */ - boolean_t master; /* logical cpu is the master (boot) CPU */ - boolean_t primary;/* logical cpu is primary CPU in package */ - boolean_t halted; /* logical cpu is halted */ - boolean_t idle; /* logical cpu is idle */ - uint64_t rtcPop; /* when etimer wants a timer pop */ + struct x86_lcpu *next_in_core; /* next logical cpu in core */ + struct x86_lcpu *next_in_die; /* next logical cpu in die */ + struct x86_lcpu *next_in_pkg; /* next logical cpu in package */ + struct x86_lcpu *lcpu; /* pointer back to self */ + struct x86_core *core; /* core containing the logical cpu */ + struct x86_die *die; /* die containing the logical cpu */ + struct x86_pkg *package; /* package containing the logical cpu */ + struct cpu_data *cpu; /* cpu_data structure */ + uint32_t cpu_num; /* cpu number */ + uint32_t lnum; /* logical cpu number (within core) */ + uint32_t pnum; /* physical cpu number */ + boolean_t master; /* logical cpu is the master (boot) CPU */ + boolean_t primary; /* logical cpu is primary CPU in package */ + volatile lcpu_state_t state; /* state of the logical CPU */ + volatile boolean_t stopped; /* used to indicate that the CPU has "stopped" */ + uint64_t rtcPop; /* when etimer wants a timer pop */ uint64_t rtcDeadline; x86_cpu_cache_t *caches[MAX_CACHE_DEPTH]; + struct pmc *pmc; /* Pointer to perfmon data */ + void *pmStats; /* Power management stats for lcpu */ + void *pmState; /* Power management state for lcpu */ } x86_lcpu_t; #define X86CORE_FL_PRESENT 0x80000000 /* core is present */ @@ -93,25 +155,38 @@ typedef struct x86_lcpu typedef struct x86_core { - struct x86_core *next; /* next core in package */ - struct x86_lcpu 
*lcpus; /* list of logical cpus in core */ + struct x86_core *next_in_die; /* next core in die */ + struct x86_core *next_in_pkg; /* next core in package */ + struct x86_die *die; /* die containing the core */ struct x86_pkg *package; /* package containing core */ + struct x86_lcpu *lcpus; /* list of logical cpus in core */ uint32_t flags; - uint32_t lcore_num; /* logical core # (unique to package) */ + uint32_t lcore_num; /* logical core # (unique within die) */ uint32_t pcore_num; /* physical core # (globally unique) */ uint32_t num_lcpus; /* Number of logical cpus */ - uint32_t active_lcpus; /* Number of non-halted cpus */ - struct pmc *pmc; /* Pointer to perfmon data */ - struct hpetTimer *Hpet; /* Address of the HPET for this core */ - uint32_t HpetVec; /* Interrupt vector for HPET */ - uint64_t HpetInt; /* Number of HPET Interrupts */ - uint64_t HpetCmp; /* HPET Comparitor */ - uint64_t HpetCfg; /* HPET configuration */ - uint64_t HpetTime; + uint32_t active_lcpus; /* Number of {running, idle} cpus */ void *pmStats; /* Power management stats for core */ void *pmState; /* Power management state for core */ } x86_core_t; +#define X86DIE_FL_PRESENT 0x80000000 /* die is present */ +#define X86DIE_FL_READY 0x40000000 /* die struct is init'd */ + +typedef struct x86_die +{ + struct x86_die *next_in_pkg; /* next die in package */ + struct x86_lcpu *lcpus; /* list of lcpus in die */ + struct x86_core *cores; /* list of cores in die */ + struct x86_pkg *package; /* package containing the die */ + uint32_t flags; + uint32_t ldie_num; /* logical die # (unique to package) */ + uint32_t pdie_num; /* physical die # (globally unique) */ + uint32_t num_cores; /* Number of cores in die */ + x86_cpu_cache_t *LLC; /* LLC contained in this die */ + void *pmStats; /* Power Management stats for die */ + void *pmState; /* Power Management state for die */ +} x86_die_t; + #define X86PKG_FL_PRESENT 0x80000000 /* package is present */ #define X86PKG_FL_READY 0x40000000 /* package 
struct init'd */ #define X86PKG_FL_HAS_HPET 0x10000000 /* package has HPET assigned */ @@ -121,27 +196,43 @@ typedef struct x86_core typedef struct x86_pkg { struct x86_pkg *next; /* next package */ + struct x86_lcpu *lcpus; /* list of logical cpus in package */ struct x86_core *cores; /* list of cores in package */ + struct x86_die *dies; /* list of dies in package */ uint32_t flags; uint32_t lpkg_num; /* logical package # */ uint32_t ppkg_num; /* physical package # */ - uint32_t num_cores; /* number of cores in package */ - struct hpetTimer *Hpet; /* address of HPET for this package */ - uint32_t HpetVec; /* Interrupt vector for HPET */ - uint64_t HpetInt; /* Number of HPET interrupts */ - uint64_t HpetCmp; /* HPET comparitor */ - uint64_t HpetCfg; /* HPET configuration */ - uint64_t HpetTime; + uint32_t num_dies; /* number of dies in package */ void *pmStats; /* Power Management stats for package*/ void *pmState; /* Power Management state for package*/ + struct mca_state *mca_state; /* MCA state for memory errors */ } x86_pkg_t; extern x86_pkg_t *x86_pkgs; /* root of all CPU packages */ + +typedef struct x86_topology_parameters +{ + uint32_t LLCDepth; + uint32_t nCoresSharingLLC; + uint32_t nLCPUsSharingLLC; + uint32_t maxSharingLLC; + uint32_t nLThreadsPerCore; + uint32_t nPThreadsPerCore; + uint32_t nLCoresPerDie; + uint32_t nPCoresPerDie; + uint32_t nLDiesPerPackage; + uint32_t nPDiesPerPackage; + uint32_t nLThreadsPerDie; + uint32_t nPThreadsPerDie; + uint32_t nLThreadsPerPackage; + uint32_t nPThreadsPerPackage; + uint32_t nLCoresPerPackage; + uint32_t nPCoresPerPackage; + uint32_t nPackages; +} x86_topology_parameters_t; /* Called after cpu discovery */ extern void cpu_topology_start(void); -extern int idlehalt; - #endif /* _I386_CPU_TOPOLOGY_H_ */ #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/i386/cpuid.c b/osfmk/i386/cpuid.c index a23ed95df..f9c58c5bb 100644 --- a/osfmk/i386/cpuid.c +++ b/osfmk/i386/cpuid.c @@ -213,6 +213,76 @@ cpuid_set_cache_info( 
i386_cpu_info_t * info_p ) else if (linesizes[L1D]) info_p->cache_linesize = linesizes[L1D]; else panic("no linesize"); + + /* + * Extract and publish TLB information. + */ + for (i = 1; i < sizeof(info_p->cache_info); i++) { + uint8_t desc = info_p->cache_info[i]; + + switch (desc) { + case CPUID_CACHE_ITLB_4K_32_4: + info_p->cpuid_itlb_small = 32; + break; + case CPUID_CACHE_ITLB_4M_2: + info_p->cpuid_itlb_large = 2; + break; + case CPUID_CACHE_DTLB_4K_64_4: + info_p->cpuid_dtlb_small = 64; + break; + case CPUID_CACHE_DTLB_4M_8_4: + info_p->cpuid_dtlb_large = 8; + break; + case CPUID_CACHE_DTLB_4M_32_4: + info_p->cpuid_dtlb_large = 32; + break; + case CPUID_CACHE_ITLB_64: + info_p->cpuid_itlb_small = 64; + info_p->cpuid_itlb_large = 64; + break; + case CPUID_CACHE_ITLB_128: + info_p->cpuid_itlb_small = 128; + info_p->cpuid_itlb_large = 128; + break; + case CPUID_CACHE_ITLB_256: + info_p->cpuid_itlb_small = 256; + info_p->cpuid_itlb_large = 256; + break; + case CPUID_CACHE_DTLB_64: + info_p->cpuid_dtlb_small = 64; + info_p->cpuid_dtlb_large = 64; + break; + case CPUID_CACHE_DTLB_128: + info_p->cpuid_dtlb_small = 128; + info_p->cpuid_dtlb_large = 128; + break; + case CPUID_CACHE_DTLB_256: + info_p->cpuid_dtlb_small = 256; + info_p->cpuid_dtlb_large = 256; + break; + case CPUID_CACHE_ITLB_4M2M_7: + info_p->cpuid_itlb_large = 7; + break; + case CPUID_CACHE_DTLB_4K_16_4: + info_p->cpuid_dtlb_small = 16; + break; + case CPUID_CACHE_DTLB_4M2M_32_4: + info_p->cpuid_dtlb_large = 32; + break; + case CPUID_CACHE_ITLB_4K_128_4: + info_p->cpuid_itlb_small = 128; + break; + case CPUID_CACHE_ITLB_4M_8: + info_p->cpuid_itlb_large = 8; + break; + case CPUID_CACHE_DTLB_4K_128_4: + info_p->cpuid_dtlb_small = 128; + break; + case CPUID_CACHE_DTLB_4K_256_4: + info_p->cpuid_dtlb_small = 256; + break; + } + } } static void @@ -291,7 +361,7 @@ cpuid_set_generic_info(i386_cpu_info_t *info_p) /* Fold extensions into family/model */ if (info_p->cpuid_family == 0x0f) info_p->cpuid_family += 
info_p->cpuid_extfamily; - if (info_p->cpuid_family == 0x0f || info_p->cpuid_family== 0x06) + if (info_p->cpuid_family == 0x0f || info_p->cpuid_family == 0x06) info_p->cpuid_model += (info_p->cpuid_extmodel << 4); if (info_p->cpuid_features & CPUID_FEATURE_HTT) @@ -306,7 +376,7 @@ cpuid_set_generic_info(i386_cpu_info_t *info_p) quad(cpuid_reg[ecx], cpuid_reg[edx]); } - if (info_p->cpuid_extfeatures && CPUID_FEATURE_MONITOR) { + if (info_p->cpuid_features & CPUID_FEATURE_MONITOR) { /* * Extract the Monitor/Mwait Leaf info: */ @@ -373,6 +443,13 @@ cpuid_set_info(void) cpuid_set_cache_info(&cpuid_cpu_info); + if (cpuid_cpu_info.core_count == 0) { + cpuid_cpu_info.core_count = + cpuid_cpu_info.cpuid_cores_per_package; + cpuid_cpu_info.thread_count = + cpuid_cpu_info.cpuid_logical_per_package; + } + cpuid_cpu_info.cpuid_model_string = ""; /* deprecated */ } @@ -422,6 +499,7 @@ static struct { {CPUID_FEATURE_PDCM, "PDCM"}, {CPUID_FEATURE_SSE4_1, "SSE4.1"}, {CPUID_FEATURE_SSE4_2, "SSE4.2"}, + {CPUID_FEATURE_xAPIC, "xAPIC"}, {CPUID_FEATURE_POPCNT, "POPCNT"}, {0, 0} }, @@ -436,7 +514,7 @@ extfeature_map[] = { i386_cpu_info_t * cpuid_info(void) { - /* Set-up the cpuid_indo stucture lazily */ + /* Set-up the cpuid_info structure lazily */ if (cpuid_cpu_infop == NULL) { cpuid_set_info(); cpuid_cpu_infop = &cpuid_cpu_info; @@ -571,12 +649,12 @@ uint64_t cpuid_features(void) { static int checked = 0; - char fpu_arg[16] = { 0 }; + char fpu_arg[20] = { 0 }; (void) cpuid_info(); if (!checked) { /* check for boot-time fpu limitations */ - if (PE_parse_boot_arg("_fpu", &fpu_arg[0])) { + if (PE_parse_boot_argn("_fpu", &fpu_arg[0], sizeof (fpu_arg))) { printf("limiting fpu features to: %s\n", fpu_arg); if (!strncmp("387", fpu_arg, sizeof("387")) || !strncmp("mmx", fpu_arg, sizeof("mmx"))) { printf("no sse or sse2\n"); diff --git a/osfmk/i386/cpuid.h b/osfmk/i386/cpuid.h index efc6bb239..34eed7b4d 100644 --- a/osfmk/i386/cpuid.h +++ b/osfmk/i386/cpuid.h @@ -95,8 +95,10 @@ #define 
CPUID_FEATURE_CX16 _HBit(13) /* CmpXchg16b instruction */ #define CPUID_FEATURE_xTPR _HBit(14) /* Send Task PRiority msgs */ #define CPUID_FEATURE_PDCM _HBit(15) /* Perf/Debug Capability MSR */ +#define CPUID_FEATURE_DCA _HBit(18) /* Direct Cache Access */ #define CPUID_FEATURE_SSE4_1 _HBit(19) /* Streaming SIMD extensions 4.1 */ -#define CPUID_FEATURE_SSE4_2 _HBit(20) /* Streaming SIMD extensions 4.1 */ +#define CPUID_FEATURE_SSE4_2 _HBit(20) /* Streaming SIMD extensions 4.2 */ +#define CPUID_FEATURE_xAPIC _HBit(21) /* Extended APIC Mode */ #define CPUID_FEATURE_POPCNT _HBit(23) /* POPCNT instruction */ /* @@ -112,20 +114,24 @@ #define CPUID_CACHE_SIZE 16 /* Number of descriptor vales */ #define CPUID_CACHE_NULL 0x00 /* NULL */ -#define CPUID_CACHE_ITLB_4K 0x01 /* Instruction TLB: 4K pages */ -#define CPUID_CACHE_ITLB_4M 0x02 /* Instruction TLB: 4M pages */ -#define CPUID_CACHE_DTLB_4K 0x03 /* Data TLB: 4K pages */ -#define CPUID_CACHE_DTLB_4M 0x04 /* Data TLB: 4M pages */ -#define CPUID_CACHE_ICACHE_8K 0x06 /* Instruction cache: 8K */ -#define CPUID_CACHE_ICACHE_16K 0x08 /* Instruction cache: 16K */ -#define CPUID_CACHE_DCACHE_8K 0x0A /* Data cache: 8K */ -#define CPUID_CACHE_DCACHE_16K 0x0C /* Data cache: 16K */ +#define CPUID_CACHE_ITLB_4K_32_4 0x01 /* Inst TLB: 4K pages, 32 ents, 4-way */ +#define CPUID_CACHE_ITLB_4M_2 0x02 /* Inst TLB: 4M pages, 2 ents */ +#define CPUID_CACHE_DTLB_4K_64_4 0x03 /* Data TLB: 4K pages, 64 ents, 4-way */ +#define CPUID_CACHE_DTLB_4M_8_4 0x04 /* Data TLB: 4M pages, 8 ents, 4-way */ +#define CPUID_CACHE_DTLB_4M_32_4 0x05 /* Data TLB: 4M pages, 32 ents, 4-way */ +#define CPUID_CACHE_L1I_8K 0x06 /* Icache: 8K */ +#define CPUID_CACHE_L1I_16K 0x08 /* Icache: 16K */ +#define CPUID_CACHE_L1I_32K 0x09 /* Icache: 32K, 4-way, 64 bytes */ +#define CPUID_CACHE_L1D_8K 0x0A /* Dcache: 8K */ +#define CPUID_CACHE_L1D_16K 0x0C /* Dcache: 16K */ +#define CPUID_CACHE_L1D_16K_4_32 0x0D /* Dcache: 16K, 4-way, 64 byte, ECC */ +#define 
CPUID_CACHE_L2_256K_8_64 0x21 /* L2: 256K, 8-way, 64 bytes */ #define CPUID_CACHE_L3_512K 0x22 /* L3: 512K */ #define CPUID_CACHE_L3_1M 0x23 /* L3: 1M */ #define CPUID_CACHE_L3_2M 0x25 /* L3: 2M */ #define CPUID_CACHE_L3_4M 0x29 /* L3: 4M */ -#define CPUID_CACHE_DCACHE_32K 0x2C /* Data cache: 32K, 8-way */ -#define CPUID_CACHE_ICACHE_32K 0x30 /* Instruction cache: 32K, 8-way */ +#define CPUID_CACHE_L1D_32K_8 0x2C /* Dcache: 32K, 8-way, 64 byte */ +#define CPUID_CACHE_L1I_32K_8 0x30 /* Icache: 32K, 8-way */ #define CPUID_CACHE_L2_128K_S4 0x39 /* L2: 128K, 4-way, sectored */ #define CPUID_CACHE_L2_128K_S2 0x3B /* L2: 128K, 2-way, sectored */ #define CPUID_CACHE_L2_256K_S4 0x3C /* L2: 256K, 4-way, sectored */ @@ -144,18 +150,20 @@ #define CPUID_CACHE_L2_12M_12_64 0x4C /* L2: 12M, 12-way, 64 bytes */ #define CPUID_CACHE_L2_16M_16_64 0x4D /* L2: 16M, 16-way, 64 bytes */ #define CPUID_CACHE_L2_6M_24_64 0x4E /* L2: 6M, 24-way, 64 bytes */ -#define CPUID_CACHE_ITLB_64 0x50 /* Instruction TLB: 64 entries */ -#define CPUID_CACHE_ITLB_128 0x51 /* Instruction TLB: 128 entries */ -#define CPUID_CACHE_ITLB_256 0x52 /* Instruction TLB: 256 entries */ +#define CPUID_CACHE_ITLB_64 0x50 /* Inst TLB: 64 entries */ +#define CPUID_CACHE_ITLB_128 0x51 /* Inst TLB: 128 entries */ +#define CPUID_CACHE_ITLB_256 0x52 /* Inst TLB: 256 entries */ +#define CPUID_CACHE_ITLB_4M2M_7 0x55 /* Inst TLB: 4M/2M, 7 entries */ #define CPUID_CACHE_DTLB_4M_16_4 0x56 /* Data TLB: 4M, 16 entries, 4-way */ -#define CPUID_CACHE_DTLB_4K_16_4 0x56 /* Data TLB: 4K, 16 entries, 4-way */ +#define CPUID_CACHE_DTLB_4K_16_4 0x57 /* Data TLB: 4K, 16 entries, 4-way */ +#define CPUID_CACHE_DTLB_4M2M_32_4 0x5A /* Data TLB: 4M/2M, 32 entries */ #define CPUID_CACHE_DTLB_64 0x5B /* Data TLB: 64 entries */ #define CPUID_CACHE_DTLB_128 0x5C /* Data TLB: 128 entries */ #define CPUID_CACHE_DTLB_256 0x5D /* Data TLB: 256 entries */ -#define CPUID_CACHE_DCACHE_16K_8_64 0x60 /* Data cache: 16K, 8-way, 64 bytes */ -#define 
CPUID_CACHE_DCACHE_8K_4_64 0x66 /* Data cache: 8K, 4-way, 64 bytes */ -#define CPUID_CACHE_DCACHE_16K_4_64 0x67 /* Data cache: 16K, 4-way, 64 bytes */ -#define CPUID_CACHE_DCACHE_32K_4_64 0x68 /* Data cache: 32K, 4-way, 64 bytes */ +#define CPUID_CACHE_L1D_16K_8_64 0x60 /* Data cache: 16K, 8-way, 64 bytes */ +#define CPUID_CACHE_L1D_8K_4_64 0x66 /* Data cache: 8K, 4-way, 64 bytes */ +#define CPUID_CACHE_L1D_16K_4_64 0x67 /* Data cache: 16K, 4-way, 64 bytes */ +#define CPUID_CACHE_L1D_32K_4_64 0x68 /* Data cache: 32K, 4-way, 64 bytes */ #define CPUID_CACHE_TRACE_12K_8 0x70 /* Trace cache 12K-uop, 8-way */ #define CPUID_CACHE_TRACE_16K_8 0x71 /* Trace cache 16K-uop, 8-way */ #define CPUID_CACHE_TRACE_32K_8 0x72 /* Trace cache 32K-uop, 8-way */ @@ -174,15 +182,35 @@ #define CPUID_CACHE_L2_1M_8_64 0x87 /* L2: 1M, 8-way, 64 bytes */ #define CPUID_CACHE_ITLB_4K_128_4 0xB0 /* ITLB: 4KB, 128 entries, 4-way */ #define CPUID_CACHE_ITLB_4M_4_4 0xB1 /* ITLB: 4MB, 4 entries, 4-way, or */ -#define CPUID_CACHE_ITLB_2M_8_4 0xB1 /* ITLB: 2MB, 8 entries, 4-way */ +#define CPUID_CACHE_ITLB_2M_8_4 0xB1 /* ITLB: 2MB, 8 entries, 4-way, or */ +#define CPUID_CACHE_ITLB_4M_8 0xB1 /* ITLB: 4MB, 8 entries */ +#define CPUID_CACHE_ITLB_4K_64_4 0xB2 /* ITLB: 4KB, 64 entries, 4-way */ #define CPUID_CACHE_DTLB_4K_128_4 0xB3 /* DTLB: 4KB, 128 entries, 4-way */ #define CPUID_CACHE_DTLB_4K_256_4 0xB4 /* DTLB: 4KB, 256 entries, 4-way */ +#define CPUID_CACHE_2TLB_4K_512_4 0xB4 /* 2nd-level TLB: 4KB, 512, 4-way */ +#define CPUID_CACHE_L3_512K_4_64 0xD0 /* L3: 512KB, 4-way, 64 bytes */ +#define CPUID_CACHE_L3_1M_4_64 0xD1 /* L3: 1M, 4-way, 64 bytes */ +#define CPUID_CACHE_L3_2M_4_64 0xD2 /* L3: 2M, 4-way, 64 bytes */ +#define CPUID_CACHE_L3_1M_8_64 0xD6 /* L3: 1M, 8-way, 64 bytes */ +#define CPUID_CACHE_L3_2M_8_64 0xD7 /* L3: 2M, 8-way, 64 bytes */ +#define CPUID_CACHE_L3_4M_8_64 0xD8 /* L3: 4M, 8-way, 64 bytes */ +#define CPUID_CACHE_L3_1M5_12_64 0xDC /* L3: 1.5M, 12-way, 64 bytes */ +#define 
CPUID_CACHE_L3_3M_12_64 0xDD /* L3: 3M, 12-way, 64 bytes */ +#define CPUID_CACHE_L3_6M_12_64 0xDE /* L3: 6M, 12-way, 64 bytes */ +#define CPUID_CACHE_L3_2M_16_64 0xE2 /* L3: 2M, 16-way, 64 bytes */ +#define CPUID_CACHE_L3_4M_16_64 0xE3 /* L3: 4M, 16-way, 64 bytes */ +#define CPUID_CACHE_L3_8M_16_64 0xE4 /* L3: 8M, 16-way, 64 bytes */ #define CPUID_CACHE_PREFETCH_64 0xF0 /* 64-Byte Prefetching */ #define CPUID_CACHE_PREFETCH_128 0xF1 /* 128-Byte Prefetching */ #define CPUID_MWAIT_EXTENSION _Bit(0) /* enumeration of WMAIT extensions */ #define CPUID_MWAIT_BREAK _Bit(1) /* interrupts are break events */ +#define CPUID_MODEL_YONAH 14 +#define CPUID_MODEL_MEROM 15 +#define CPUID_MODEL_PENRYN 23 +#define CPUID_MODEL_NEHALEM 26 + #ifndef ASSEMBLER #include #include @@ -297,6 +325,18 @@ typedef struct { /* Virtual and physical address aize: */ uint32_t cpuid_address_bits_physical; uint32_t cpuid_address_bits_virtual; + + uint32_t cpuid_microcode_version; + + /* Numbers of tlbs per processor */ + uint32_t cpuid_itlb_small; + uint32_t cpuid_dtlb_small; + uint32_t cpuid_itlb_large; + uint32_t cpuid_dtlb_large; + + uint32_t core_count; + uint32_t thread_count; + } i386_cpu_info_t; #ifdef __cplusplus diff --git a/osfmk/i386/db_machdep.h b/osfmk/i386/db_machdep.h index fda25c022..99577bb30 100644 --- a/osfmk/i386/db_machdep.h +++ b/osfmk/i386/db_machdep.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,7 +70,6 @@ #include #include #include -#include typedef addr64_t db_addr_t; /* address - unsigned */ typedef uint64_t db_expr_t; /* expression */ @@ -172,9 +171,6 @@ extern void db_msr(db_expr_t addr, boolean_t have_addr, db_expr_t count, char *modif); extern void db_apic(db_expr_t addr, boolean_t have_addr, db_expr_t count, char *modif); -extern void db_display_hpet(hpetReg_t *); -extern void db_hpet(db_expr_t addr, boolean_t have_addr, db_expr_t count, - char *modif); /* macros for printing OS server dependent task name */ @@ -203,7 +199,6 @@ extern void kdb_on( int cpu); #if MACH_KDB -extern void db_getpmgr(pmData_t *pmj); extern void db_chkpmgr(void); #endif /* MACH_KDB */ extern void db_pmgr(db_expr_t addr, int have_addr, db_expr_t count, char * modif); diff --git a/osfmk/i386/genassym.c b/osfmk/i386/genassym.c index a723585bb..86b97b4bc 100644 --- a/osfmk/i386/genassym.c +++ b/osfmk/i386/genassym.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -82,7 +82,6 @@ #include #include #include -#include #include #include #include @@ -441,6 +440,9 @@ main( DECLARE("CPU_UBER_ARG_STORE_VALID", offsetof(cpu_data_t *, cpu_uber_arg_store_valid)); + DECLARE("CPU_NANOTIME", + offsetof(cpu_data_t *, cpu_nanotime)); + DECLARE("CPU_DR7", offsetof(cpu_data_t *, cpu_dr7)); @@ -548,21 +550,6 @@ main( DECLARE("OnProc", OnProc); - - DECLARE("GCAP_ID", offsetof(hpetReg_t *, GCAP_ID)); - DECLARE("GEN_CONF", offsetof(hpetReg_t *, GEN_CONF)); - DECLARE("GINTR_STA", offsetof(hpetReg_t *, GINTR_STA)); - DECLARE("MAIN_CNT", offsetof(hpetReg_t *, MAIN_CNT)); - DECLARE("TIM0_CONF", offsetof(hpetReg_t *, TIM0_CONF)); - DECLARE("TIM_CONF", TIM_CONF); - DECLARE("Tn_INT_ENB_CNF", Tn_INT_ENB_CNF); - DECLARE("TIM0_COMP", offsetof(hpetReg_t *, TIM0_COMP)); - DECLARE("TIM_COMP", TIM_COMP); - DECLARE("TIM1_CONF", offsetof(hpetReg_t *, TIM1_CONF)); - DECLARE("TIM1_COMP", offsetof(hpetReg_t *, TIM1_COMP)); - DECLARE("TIM2_CONF", offsetof(hpetReg_t *, TIM2_CONF)); - DECLARE("TIM2_COMP", offsetof(hpetReg_t *, TIM2_COMP)); - #if CONFIG_DTRACE DECLARE("LS_LCK_MTX_LOCK_ACQUIRE", LS_LCK_MTX_LOCK_ACQUIRE); DECLARE("LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE", LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE); diff --git a/osfmk/i386/hpet.c b/osfmk/i386/hpet.c deleted file mode 100644 index 940a7c649..000000000 --- a/osfmk/i386/hpet.c +++ /dev/null @@ -1,550 +0,0 @@ -/* - * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if MACH_KDB -#include -#endif -#if MACH_KDB -#include -#include -#include -#include -#include -#include -#include -#endif /* MACH_KDB */ -#include - -/* Decimal powers: */ -#define kilo (1000ULL) -#define Mega (kilo * kilo) -#define Giga (kilo * Mega) -#define Tera (kilo * Giga) -#define Peta (kilo * Tera) - -uint32_t hpetArea = 0; -uint32_t hpetAreap = 0; -uint64_t hpetFemto = 0; -uint64_t hpetFreq = 0; -uint64_t hpetCvt = 0; /* (TAKE OUT LATER) */ -uint64_t hpetCvtt2n = 0; -uint64_t hpetCvtn2t = 0; -uint64_t tsc2hpet = 0; -uint64_t hpet2tsc = 0; -uint64_t bus2hpet = 0; -uint64_t hpet2bus = 0; - -uint32_t rcbaArea = 0; -uint32_t rcbaAreap = 0; - -static int (*hpet_req)(uint32_t 
apicid, void *arg, hpetRequest_t *hpet) = NULL; -static void *hpet_arg = NULL; - -#if DEBUG -#define DBG(x...) kprintf("DBG: " x) -#else -#define DBG(x...) -#endif - -int -hpet_register_callback(int (*hpet_reqst)(uint32_t apicid, - void *arg, - hpetRequest_t *hpet), - void *arg) -{ - hpet_req = hpet_reqst; - hpet_arg = arg; - return(0); -} - -/* - * This routine is called to obtain an HPET and have it assigned - * to a CPU. It returns 0 if successful and non-zero if one could - * not be assigned. - */ -int -hpet_request(uint32_t cpu) -{ - hpetRequest_t hpetReq; - int rc; - x86_lcpu_t *lcpu; - x86_core_t *core; - x86_pkg_t *pkg; - boolean_t enabled; - - if (hpet_req == NULL) { - return(-1); - } - - /* - * Deal with the case where the CPU # passed in is past the - * value specified in cpus=n in boot-args. - */ - if (cpu >= real_ncpus) { - enabled = ml_set_interrupts_enabled(FALSE); - lcpu = cpu_to_lcpu(cpu); - if (lcpu != NULL) { - core = lcpu->core; - pkg = core->package; - - if (lcpu->primary) { - pkg->flags |= X86PKG_FL_HAS_HPET; - } - } - - ml_set_interrupts_enabled(enabled); - return(0); - } - - rc = (*hpet_req)(ml_get_apicid(cpu), hpet_arg, &hpetReq); - if (rc != 0) { - return(rc); - } - - enabled = ml_set_interrupts_enabled(FALSE); - lcpu = cpu_to_lcpu(cpu); - core = lcpu->core; - pkg = core->package; - - /* - * Compute the address of the HPET. - */ - core->Hpet = (hpetTimer_t *)((uint8_t *)hpetArea + hpetReq.hpetOffset); - core->HpetVec = hpetReq.hpetVector; - - /* - * Enable interrupts - */ - core->Hpet->Config |= Tn_INT_ENB_CNF; - - /* - * Save the configuration - */ - core->HpetCfg = core->Hpet->Config; - core->HpetCmp = 0; - - /* - * If the CPU is the "primary" for the package, then - * add the HPET to the package too. 
- */ - if (lcpu->primary) { - pkg->Hpet = core->Hpet; - pkg->HpetCfg = core->HpetCfg; - pkg->HpetCmp = core->HpetCmp; - pkg->flags |= X86PKG_FL_HAS_HPET; - } - - ml_set_interrupts_enabled(enabled); - - return(0); -} - -/* - * Map the RCBA area. - */ -static void -map_rcbaArea(void) -{ - /* - * Get RCBA area physical address and map it - */ - outl(cfgAdr, lpcCfg | (0xF0 & 0xFC)); - rcbaAreap = inl(cfgDat | (0xF0 & 0x03)); - rcbaArea = io_map_spec(rcbaAreap & -4096, PAGE_SIZE * 4, VM_WIMG_IO); - kprintf("RCBA: vaddr = %08X, paddr = %08X\n", rcbaArea, rcbaAreap); -} - -/* - * Initialize the HPET - */ -void -hpet_init(void) -{ - unsigned int *xmod; - - map_rcbaArea(); - - /* - * Is the HPET memory already enabled? - * If not, set address and enable. - */ - xmod = (uint32_t *)(rcbaArea + 0x3404); /* Point to the HPTC */ - uint32_t hptc = *xmod; /* Get HPET config */ - DBG(" current RCBA.HPTC: %08X\n", *xmod); - if(!(hptc & hptcAE)) { - DBG("HPET memory is not enabled, " - "enabling and assigning to 0xFED00000 (hope that's ok)\n"); - *xmod = (hptc & ~3) | hptcAE; - } - - /* - * Get physical address of HPET and map it. - */ - hpetAreap = hpetAddr | ((hptc & 3) << 12); - hpetArea = io_map_spec(hpetAreap & -4096, PAGE_SIZE * 4, VM_WIMG_IO); - kprintf("HPET: vaddr = %08X, paddr = %08X\n", hpetArea, hpetAreap); - - /* - * Extract the HPET tick rate. - * The period of the HPET is reported in femtoseconds (10**-15s) - * and convert to frequency in hertz. - */ - hpetFemto = (uint32_t)(((hpetReg_t *)hpetArea)->GCAP_ID >> 32); - hpetFreq = (1 * Peta) / hpetFemto; - - /* - * The conversion factor is the number of nanoseconds per HPET tick - * with about 32 bits of fraction. The value is converted to a - * base-2 fixed point number. To convert from HPET to nanoseconds, - * multiply the value by the conversion factor using 96-bit arithmetic, - * then shift right 32 bits. If the value is known to be small, - * 64-bit arithmetic will work. 
- */ - - /* - * Begin conversion of base 10 femtoseconds to base 2, calculate: - * - HPET ticks to nanoseconds conversion in base 2 fraction (* 2**32) - * - nanoseconds to HPET ticks conversion - */ - hpetCvtt2n = (uint64_t)hpetFemto << 32; - hpetCvtt2n = hpetCvtt2n / 1000000ULL; - hpetCvtn2t = 0xFFFFFFFFFFFFFFFFULL / hpetCvtt2n; - kprintf("HPET: Frequency = %6d.%04dMHz, " - "cvtt2n = %08X.%08X, cvtn2t = %08X.%08X\n", - (uint32_t)(hpetFreq / Mega), (uint32_t)(hpetFreq % Mega), - (uint32_t)(hpetCvtt2n >> 32), (uint32_t)hpetCvtt2n, - (uint32_t)(hpetCvtn2t >> 32), (uint32_t)hpetCvtn2t); - - - /* (TAKE OUT LATER) - * Begin conversion of base 10 femtoseconds to base 2 - * HPET ticks to nanoseconds in base 2 fraction (times 1048576) - */ - hpetCvt = (uint64_t)hpetFemto << 20; - hpetCvt = hpetCvt / 1000000ULL; - - /* Calculate conversion from TSC to HPET */ - tsc2hpet = tmrCvt(tscFCvtt2n, hpetCvtn2t); - DBG(" CVT: TSC to HPET = %08X.%08X\n", - (uint32_t)(tsc2hpet >> 32), (uint32_t)tsc2hpet); - - /* Calculate conversion from HPET to TSC */ - hpet2tsc = tmrCvt(hpetCvtt2n, tscFCvtn2t); - DBG(" CVT: HPET to TSC = %08X.%08X\n", - (uint32_t)(hpet2tsc >> 32), (uint32_t)hpet2tsc); - - /* Calculate conversion from BUS to HPET */ - bus2hpet = tmrCvt(busFCvtt2n, hpetCvtn2t); - DBG(" CVT: BUS to HPET = %08X.%08X\n", - (uint32_t)(bus2hpet >> 32), (uint32_t)bus2hpet); - - /* Calculate conversion from HPET to BUS */ - hpet2bus = tmrCvt(hpetCvtt2n, busFCvtn2t); - DBG(" CVT: HPET to BUS = %08X.%08X\n", - (uint32_t)(hpet2bus >> 32), (uint32_t)hpet2bus); - -#if MACH_KDB - db_display_hpet((hpetReg_t *)hpetArea); /* (BRINGUP) */ -#endif -} - -/* - * This routine is used to get various information about the HPET - * without having to export gobs of globals. It fills in a data - * structure with the info. 
- */ -void -hpet_get_info(hpetInfo_t *info) -{ - info->hpetCvtt2n = hpetCvtt2n; - info->hpetCvtn2t = hpetCvtn2t; - info->tsc2hpet = tsc2hpet; - info->hpet2tsc = hpet2tsc; - info->bus2hpet = bus2hpet; - info->hpet2bus = hpet2bus; - /* - * XXX - * We're repurposing the rcbaArea so we can use the HPET. - * Eventually we'll rename this correctly. - */ - info->rcbaArea = hpetArea; - info->rcbaAreap = hpetAreap; -} - - -/* - * This routine is called by the HPET driver - * when it assigns an HPET timer to a processor. - * - * XXX with the new callback into the HPET driver, - * this routine will be deprecated. - */ -void -ml_hpet_cfg(uint32_t cpu, uint32_t hpetVect) -{ - uint64_t *hpetVaddr; - hpetTimer_t *hpet; - x86_lcpu_t *lcpu; - x86_core_t *core; - x86_pkg_t *pkg; - boolean_t enabled; - - if(cpu > 1) { - panic("ml_hpet_cfg: invalid cpu = %d\n", cpu); - } - - lcpu = cpu_to_lcpu(cpu); - core = lcpu->core; - pkg = core->package; - - /* - * Only deal with the primary CPU for the package. - */ - if (!lcpu->primary) - return; - - enabled = ml_set_interrupts_enabled(FALSE); - - /* Calculate address of the HPET for this processor */ - hpetVaddr = (uint64_t *)(((uint32_t)&(((hpetReg_t *)hpetArea)->TIM1_CONF)) + (cpu << 5)); - hpet = (hpetTimer_t *)hpetVaddr; - - DBG("ml_hpet_cfg: HPET for cpu %d at %p, vector = %d\n", - cpu, hpetVaddr, hpetVect); - - /* Save the address and vector of the HPET for this processor */ - core->Hpet = hpet; - core->HpetVec = hpetVect; - - /* - * Enable interrupts - */ - core->Hpet->Config |= Tn_INT_ENB_CNF; - - /* Save the configuration */ - core->HpetCfg = core->Hpet->Config; - core->HpetCmp = 0; - - /* - * We're only doing this for the primary CPU, so go - * ahead and add the HPET to the package too. - */ - pkg->Hpet = core->Hpet; - pkg->HpetVec = core->HpetVec; - pkg->HpetCfg = core->HpetCfg; - pkg->HpetCmp = core->HpetCmp; - pkg->flags |= X86PKG_FL_HAS_HPET; - - ml_set_interrupts_enabled(enabled); -} - -/* - * This is the HPET interrupt handler. 
- * - * It just hands off to the power management code so that the - * appropriate things get done there. - */ -int -HPETInterrupt(void) -{ - - /* All we do here is to bump the count */ - x86_package()->HpetInt++; - - /* - * Let power management do it's thing. - */ - pmHPETInterrupt(); - - /* Return and show that the 'rupt has been handled... */ - return 1; -} - - -static hpetReg_t saved_hpet; - -void -hpet_save(void) -{ - hpetReg_t *from = (hpetReg_t *) hpetArea; - hpetReg_t *to = &saved_hpet; - - to->GEN_CONF = from->GEN_CONF; - to->TIM0_CONF = from->TIM0_CONF; - to->TIM0_COMP = from->TIM0_COMP; - to->TIM1_CONF = from->TIM1_CONF; - to->TIM1_COMP = from->TIM1_COMP; - to->TIM2_CONF = from->TIM2_CONF; - to->TIM2_COMP = from->TIM2_COMP; - to->MAIN_CNT = from->MAIN_CNT; -} - -void -hpet_restore(void) -{ - hpetReg_t *from = &saved_hpet; - hpetReg_t *to = (hpetReg_t *) hpetArea; - - /* - * Is the HPET memory already enabled? - * If not, set address and enable. - */ - uint32_t *hptcp = (uint32_t *)(rcbaArea + 0x3404); - uint32_t hptc = *hptcp; - if(!(hptc & hptcAE)) { - DBG("HPET memory is not enabled, " - "enabling and assigning to 0xFED00000 (hope that's ok)\n"); - *hptcp = (hptc & ~3) | hptcAE; - } - - to->GEN_CONF = from->GEN_CONF & ~1; - - to->TIM0_CONF = from->TIM0_CONF; - to->TIM0_COMP = from->TIM0_COMP; - to->TIM1_CONF = from->TIM1_CONF; - to->TIM1_COMP = from->TIM1_COMP; - to->TIM2_CONF = from->TIM2_CONF; - to->TIM2_COMP = from->TIM2_COMP; - to->GINTR_STA = -1ULL; - to->MAIN_CNT = from->MAIN_CNT; - - to->GEN_CONF = from->GEN_CONF; -} - -/* - * Read the HPET timer - * - */ -uint64_t -rdHPET(void) -{ - hpetReg_t *hpetp = (hpetReg_t *) hpetArea; - volatile uint32_t *regp = (uint32_t *) &hpetp->MAIN_CNT; - uint32_t high; - uint32_t low; - - do { - high = *(regp + 1); - low = *regp; - } while (high != *(regp + 1)); - - return (((uint64_t) high) << 32) | low; -} - -#if MACH_KDB - -#define HI32(x) ((uint32_t)(((x) >> 32) & 0xFFFFFFFF)) -#define LO32(x) ((uint32_t)((x) 
& 0xFFFFFFFF)) - -/* - * Displays HPET memory mapped area - * hp - */ -void -db_hpet(__unused db_expr_t addr, __unused int have_addr, __unused db_expr_t count, __unused char *modif) -{ - - db_display_hpet((hpetReg_t *) hpetArea); /* Dump out the HPET - * stuff */ - return; -} - -void -db_display_hpet(hpetReg_t *hpt) -{ - uint64_t cmain; - - cmain = hpt->MAIN_CNT; /* Get the main timer */ - - /* General capabilities */ - db_printf(" GCAP_ID = %08X.%08X\n", - HI32(hpt->GCAP_ID), LO32(hpt->GCAP_ID)); - /* General configuration */ - db_printf(" GEN_CONF = %08X.%08X\n", - HI32(hpt->GEN_CONF), LO32(hpt->GEN_CONF)); - /* General Interrupt status */ - db_printf("GINTR_STA = %08X.%08X\n", - HI32(hpt->GINTR_STA), LO32(hpt->GINTR_STA)); - /* Main counter */ - db_printf(" MAIN_CNT = %08X.%08X\n", - HI32(cmain), LO32(cmain)); - /* Timer 0 config and cap */ - db_printf("TIM0_CONF = %08X.%08X\n", - HI32(hpt->TIM0_CONF), LO32(hpt->TIM0_CONF)); - /* Timer 0 comparator */ - db_printf("TIM0_COMP = %08X.%08X\n", - HI32(hpt->TIM0_COMP), LO32(hpt->TIM0_COMP)); - /* Timer 1 config and cap */ - db_printf("TIM0_CONF = %08X.%08X\n", - HI32(hpt->TIM1_CONF), LO32(hpt->TIM1_CONF)); - /* Timer 1 comparator */ - db_printf("TIM1_COMP = %08X.%08X\n", - HI32(hpt->TIM1_COMP), LO32(hpt->TIM1_COMP)); - /* Timer 2 config and cap */ - db_printf("TIM2_CONF = %08X.%08X\n", - HI32(hpt->TIM2_CONF), LO32(hpt->TIM2_CONF)); - /* Timer 2 comparator */ - db_printf("TIM2_COMP = %08X.%08X\n", - HI32(hpt->TIM2_COMP), LO32(hpt->TIM2_COMP)); - - db_printf("\nHPET Frequency = %d.%05dMHz\n", - (uint32_t) (hpetFreq / 1000000), (uint32_t) (hpetFreq % 1000000)); -} -#endif diff --git a/osfmk/i386/hpet.h b/osfmk/i386/hpet.h deleted file mode 100644 index 72656d06a..000000000 --- a/osfmk/i386/hpet.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifdef KERNEL_PRIVATE -#ifndef _I386_HPET_H_ -#define _I386_HPET_H_ - -/* - * HPET kernel functions to support the HPET KEXT and the - * power management KEXT. 
- */ - - -/* - * Memory mapped registers for the HPET - */ -typedef struct hpetReg { - uint64_t GCAP_ID; /* General capabilities */ - uint64_t rsv1; - uint64_t GEN_CONF; /* General configuration */ - uint64_t rsv2; - uint64_t GINTR_STA; /* General Interrupt status */ - uint64_t rsv3[25]; - uint64_t MAIN_CNT; /* Main counter */ - uint64_t rsv4; - uint64_t TIM0_CONF; /* Timer 0 config and cap */ -#define TIM_CONF 0 -#define Tn_INT_ENB_CNF 4 - uint64_t TIM0_COMP; /* Timer 0 comparator */ -#define TIM_COMP 8 - uint64_t rsv5[2]; - uint64_t TIM1_CONF; /* Timer 1 config and cap */ - uint64_t TIM1_COMP; /* Timer 1 comparator */ - uint64_t rsv6[2]; - uint64_t TIM2_CONF; /* Timer 2 config and cap */ - uint64_t TIM2_COMP; /* Timer 2 comparator */ - uint64_t rsv7[2]; -} hpetReg; -typedef struct hpetReg hpetReg_t; - -typedef struct hpetTimer { - uint64_t Config; /* Timer config and capabilities */ - uint64_t Compare; /* Timer comparitor */ -} hpetTimer_t; - -struct hpetInfo -{ - uint64_t hpetCvtt2n; - uint64_t hpetCvtn2t; - uint64_t tsc2hpet; - uint64_t hpet2tsc; - uint64_t bus2hpet; - uint64_t hpet2bus; - uint32_t rcbaArea; - uint32_t rcbaAreap; -}; -typedef struct hpetInfo hpetInfo_t; - -struct hpetRequest -{ - uint32_t flags; - uint32_t hpetOffset; - uint32_t hpetVector; -}; -typedef struct hpetRequest hpetRequest_t; - -#define HPET_REQFL_64BIT 0x00000001 /* Timer is 64 bits */ - -extern uint64_t hpetFemto; -extern uint64_t hpetFreq; -extern uint64_t hpetCvtt2n; -extern uint64_t hpetCvtn2t; -extern uint64_t tsc2hpet; -extern uint64_t hpet2tsc; -extern uint64_t bus2hpet; -extern uint64_t hpet2bus; - -extern uint32_t rcbaArea; -extern uint32_t rcbaAreap; - -extern void map_rcbaAread(void); -extern void hpet_init(void); - -extern void hpet_save(void); -extern void hpet_restore(void); - -#ifdef XNU_KERNEL_PRIVATE -extern int HPETInterrupt(void); -#endif - -extern int hpet_register_callback(int (*hpet_reqst)(uint32_t apicid, void *arg, hpetRequest_t *hpet), void *arg); -extern 
int hpet_request(uint32_t cpu); - -extern uint64_t rdHPET(void); -extern void hpet_get_info(hpetInfo_t *info); - -#define hpetAddr 0xFED00000 -#define hptcAE 0x80 - -#endif /* _I386_HPET_H_ */ - -#endif /* KERNEL_PRIVATE */ diff --git a/osfmk/i386/i386_init.c b/osfmk/i386/i386_init.c index 67f0f2803..8005189dd 100644 --- a/osfmk/i386/i386_init.c +++ b/osfmk/i386/i386_init.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2003-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -91,7 +91,6 @@ #include #include #include -#include #include /* LcksOpts */ #if MACH_KDB #include @@ -153,11 +152,11 @@ i386_init(vm_offset_t boot_args_start) /* setup debugging output if one has been chosen */ PE_init_kprintf(FALSE); - if (!PE_parse_boot_arg("diag", &dgWork.dgFlags)) + if (!PE_parse_boot_argn("diag", &dgWork.dgFlags, sizeof (dgWork.dgFlags))) dgWork.dgFlags = 0; serialmode = 0; - if(PE_parse_boot_arg("serial", &serialmode)) { + if(PE_parse_boot_argn("serial", &serialmode, sizeof (serialmode))) { /* We want a serial keyboard and/or console */ kprintf("Serial mode specified: %08X\n", serialmode); } @@ -172,12 +171,12 @@ i386_init(vm_offset_t boot_args_start) kprintf("version_variant = %s\n", version_variant); kprintf("version = %s\n", version); - if (!PE_parse_boot_arg("maxmem", &maxmem)) - maxmemtouse=0; + if (!PE_parse_boot_argn("maxmem", &maxmem, sizeof (maxmem))) + maxmemtouse = 0; else maxmemtouse = ((uint64_t)maxmem) * (uint64_t)(1024 * 1024); - if (PE_parse_boot_arg("cpus", &cpus)) { + if (PE_parse_boot_argn("cpus", &cpus, sizeof (cpus))) { if ((0 < cpus) && (cpus < max_ncpus)) max_ncpus = cpus; } @@ -185,7 +184,7 @@ i386_init(vm_offset_t boot_args_start) /* * debug support for > 4G systems */ - if (!PE_parse_boot_arg("himemory_mode", &vm_himemory_mode)) + if (!PE_parse_boot_argn("himemory_mode", &vm_himemory_mode, sizeof (vm_himemory_mode))) vm_himemory_mode = 0; if 
(!PE_parse_boot_argn("immediate_NMI", &fidn, sizeof (fidn))) @@ -200,7 +199,7 @@ i386_init(vm_offset_t boot_args_start) boolean_t IA32e = FALSE; if (cpuid_extfeatures() & CPUID_EXTFEATURE_EM64T) { kprintf("EM64T supported"); - if (PE_parse_boot_arg("-legacy", &legacy_mode)) { + if (PE_parse_boot_argn("-legacy", &legacy_mode, sizeof (legacy_mode))) { kprintf(" but legacy mode forced\n"); } else { IA32e = TRUE; @@ -212,7 +211,7 @@ i386_init(vm_offset_t boot_args_start) nx_enabled = 0; /* Obtain "lcks" options:this currently controls lock statistics */ - if (!PE_parse_boot_arg("lcks", &LcksOpts)) + if (!PE_parse_boot_argn("lcks", &LcksOpts, sizeof (LcksOpts))) LcksOpts = 0; /* @@ -221,11 +220,10 @@ i386_init(vm_offset_t boot_args_start) */ i386_vm_init(maxmemtouse, IA32e, kernelBootArgs); - if ( ! PE_parse_boot_arg("novmx", &noVMX)) + if ( ! PE_parse_boot_argn("novmx", &noVMX, sizeof (noVMX))) noVMX = 0; /* OK to support Altivec in rosetta? */ tsc_init(); - hpet_init(); power_management_init(); PE_init_platform(TRUE, kernelBootArgs); diff --git a/osfmk/i386/i386_lock.s b/osfmk/i386/i386_lock.s index d8ffd44e4..b5153c73d 100644 --- a/osfmk/i386/i386_lock.s +++ b/osfmk/i386/i386_lock.s @@ -421,6 +421,7 @@ LEAF_ENTRY(hw_lock_to) mov %edx,%edi rdtsc /* read cyclecount into %edx:%eax */ + lfence addl %ecx,%eax /* fetch and timeout */ adcl $0,%edx /* add carry */ mov %edx,%ecx @@ -442,6 +443,7 @@ LEAF_ENTRY(hw_lock_to) * Here after spinning INNER_LOOP_COUNT times, check for timeout */ rdtsc /* cyclecount into %edx:%eax */ + lfence cmpl %ecx,%edx /* compare high-order 32-bits */ jb 4b /* continue spinning if less, or */ cmpl %ebx,%eax /* compare low-order 32-bits */ diff --git a/osfmk/i386/i386_vm_init.c b/osfmk/i386/i386_vm_init.c index 9123aa771..797022979 100644 --- a/osfmk/i386/i386_vm_init.c +++ b/osfmk/i386/i386_vm_init.c @@ -196,7 +196,7 @@ i386_vm_init(uint64_t maxmem, * Compute the memory size. 
*/ - if ((1 == vm_himemory_mode) || PE_parse_boot_arg("-x", &safeboot)) { + if ((1 == vm_himemory_mode) || PE_parse_boot_argn("-x", &safeboot, sizeof (safeboot))) { maxpg = 1 << (32 - I386_PGSHIFT); } avail_remaining = 0; @@ -440,12 +440,12 @@ i386_vm_init(uint64_t maxmem, kprintf("Physical memory %llu MB\n", sane_size/MEG); - if (!PE_parse_boot_arg("max_valid_dma_addr", &maxdmaaddr)) + if (!PE_parse_boot_argn("max_valid_dma_addr", &maxdmaaddr, sizeof (maxdmaaddr))) max_valid_dma_address = 1024ULL * 1024ULL * 4096ULL; else max_valid_dma_address = ((uint64_t) maxdmaaddr) * 1024ULL * 1024ULL; - if (!PE_parse_boot_arg("maxbouncepool", &maxbouncepoolsize)) + if (!PE_parse_boot_argn("maxbouncepool", &maxbouncepoolsize, sizeof (maxbouncepoolsize))) maxbouncepoolsize = MAXBOUNCEPOOL; else maxbouncepoolsize = maxbouncepoolsize * (1024 * 1024); @@ -455,7 +455,7 @@ i386_vm_init(uint64_t maxmem, * in order to correctly determine the size of the mbuf pool * that will be reserved */ - if (!PE_parse_boot_arg("maxloreserve", &maxloreserve)) + if (!PE_parse_boot_argn("maxloreserve", &maxloreserve, sizeof (maxloreserve))) maxloreserve = MAXLORESERVE + bsd_mbuf_cluster_reserve(); else maxloreserve = maxloreserve * (1024 * 1024); diff --git a/osfmk/i386/lapic.c b/osfmk/i386/lapic.c new file mode 100644 index 000000000..1dd1212db --- /dev/null +++ b/osfmk/i386/lapic.c @@ -0,0 +1,597 @@ +/* + * Copyright (c) 2008 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * @OSF_COPYRIGHT@ + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if MACH_KDB +#include +#endif + +#include + +#if MP_DEBUG +#define PAUSE delay(1000000) +#define DBG(x...) kprintf(x) +#else +#define DBG(x...) 
+#define PAUSE +#endif /* MP_DEBUG */ + +/* Initialize lapic_id so cpu_number() works on non SMP systems */ +unsigned long lapic_id_initdata = 0; +unsigned long lapic_id = (unsigned long)&lapic_id_initdata; +vm_offset_t lapic_start; + +static i386_intr_func_t lapic_intr_func[LAPIC_FUNC_TABLE_SIZE]; + +/* TRUE if local APIC was enabled by the OS not by the BIOS */ +static boolean_t lapic_os_enabled = FALSE; + +/* Base vector for local APIC interrupt sources */ +int lapic_interrupt_base = LAPIC_DEFAULT_INTERRUPT_BASE; +/* Two-way translation tables between local APIC ids and cpu numbers; lapic_cpu_map_init() fills both with -1, meaning unmapped. */ +int lapic_to_cpu[MAX_CPUS]; +int cpu_to_lapic[MAX_CPUS]; +/* Mark every slot of both translation tables as unmapped (-1). */ +static void +lapic_cpu_map_init(void) +{ + int i; + + for (i = 0; i < MAX_CPUS; i++) { + lapic_to_cpu[i] = -1; + cpu_to_lapic[i] = -1; + } +} +/* Record, in both directions, that cpu number cpu has local APIC id apic_id. NOTE(review): neither index is range-checked and lapic_to_cpu[] has only MAX_CPUS entries -- confirm callers never pass an apic_id that is not below MAX_CPUS. */ +void +lapic_cpu_map(int apic_id, int cpu) +{ + cpu_to_lapic[cpu] = apic_id; + lapic_to_cpu[apic_id] = cpu; +} + +/* + * Retrieve the local apic ID of a cpu. + * + * Returns the local apic ID for the given processor. + * If cpu is not below MAX_CPUS, or no apic ID has been recorded for it, + * returns -1 (seen by callers as 0xFFFFFFFF, since the return type is uint32_t).
+ */ + +uint32_t +ml_get_apicid(uint32_t cpu) +{ + if(cpu >= (uint32_t)MAX_CPUS) + return 0xFFFFFFFF; /* Return -1 if cpu too big */ + + /* Return the apic ID (or -1 if not configured) */ + return (uint32_t)cpu_to_lapic[cpu]; + +} +/* Debug-only helper: kprintf every mapped entry of cpu_to_lapic[] and then of lapic_to_cpu[], skipping slots that hold -1. NOTE(review): guarded by ifdef, while the DBG and PAUSE macros in this file are guarded by if MP_DEBUG; the two disagree when MP_DEBUG is defined as 0. */ +#ifdef MP_DEBUG +static void +lapic_cpu_map_dump(void) +{ + int i; + + for (i = 0; i < MAX_CPUS; i++) { + if (cpu_to_lapic[i] == -1) + continue; + kprintf("cpu_to_lapic[%d]: %d\n", + i, cpu_to_lapic[i]); + } + for (i = 0; i < MAX_CPUS; i++) { + if (lapic_to_cpu[i] == -1) + continue; + kprintf("lapic_to_cpu[%d]: %d\n", + i, lapic_to_cpu[i]); + } +} +#endif /* MP_DEBUG */ +/* + * Set up the local APIC on the boot processor: + * - read MSR_IA32_APIC_BASE and panic unless this is the BSP with the APIC enabled; + * - find space in kernel_map and enter a VM_WIMG_IO mapping of the APIC base there; + * - point lapic_id at lapic_start + LAPIC_ID; + * - reset the apic-id/cpu tables and register this processor as cpu 0. + */ +void +lapic_init(void) +{ + int result; + vm_map_entry_t entry; + uint32_t lo; + uint32_t hi; + boolean_t is_boot_processor; + boolean_t is_lapic_enabled; + vm_offset_t lapic_base; + + /* Examine the local APIC state */ + rdmsr(MSR_IA32_APIC_BASE, lo, hi); + is_boot_processor = (lo & MSR_IA32_APIC_BASE_BSP) != 0; + is_lapic_enabled = (lo & MSR_IA32_APIC_BASE_ENABLE) != 0; + lapic_base = (lo & MSR_IA32_APIC_BASE_BASE); + kprintf("MSR_IA32_APIC_BASE 0x%x %s %s\n", lapic_base, + is_lapic_enabled ? "enabled" : "disabled", + is_boot_processor ? "BSP" : "AP"); + if (!is_boot_processor || !is_lapic_enabled) + panic("Unexpected local APIC state\n"); + + /* Establish a map to the local apic */ + lapic_start = vm_map_min(kernel_map); + result = vm_map_find_space(kernel_map, + (vm_map_address_t *) &lapic_start, + round_page(LAPIC_SIZE), 0, + VM_MAKE_TAG(VM_MEMORY_IOKIT), &entry); + if (result != KERN_SUCCESS) { + panic("smp_init: vm_map_find_entry FAILED (err=%d)", result); /* NOTE(review): the message names smp_init and vm_map_find_entry, but the failing call is vm_map_find_space() in lapic_init() */ + } + vm_map_unlock(kernel_map); /* presumably vm_map_find_space() returns with kernel_map still locked on success -- confirm */ +/* Map in the local APIC non-cacheable, as recommended by Intel + * in section 8.4.1 of the "System Programming Guide". 
+ */ + pmap_enter(pmap_kernel(), + lapic_start, + (ppnum_t) i386_btop(lapic_base), + VM_PROT_READ|VM_PROT_WRITE, + VM_WIMG_IO, + TRUE); + lapic_id = (unsigned long)(lapic_start + LAPIC_ID); + /* An APIC version below 0x14 only produces a printf warning; setup continues regardless. */ + if ((LAPIC_READ(VERSION)&LAPIC_VERSION_MASK) < 0x14) { + printf("Local APIC version 0x%x, 0x14 or greater expected\n", + (LAPIC_READ(VERSION)&LAPIC_VERSION_MASK)); + } + + /* Set up the lapic_id <-> cpu_number map and add this boot processor */ + lapic_cpu_map_init(); + lapic_cpu_map((LAPIC_READ(ID)>>LAPIC_ID_SHIFT)&LAPIC_ID_MASK, 0); + kprintf("Boot cpu local APIC id 0x%x\n", cpu_to_lapic[0]); +} + +/* Write 0 to the ERROR_STATUS register, then return the value read back from it. */ +static int +lapic_esr_read(void) +{ + /* write-read register */ + LAPIC_WRITE(ERROR_STATUS, 0); + return LAPIC_READ(ERROR_STATUS); +} +/* Write 0 to the ERROR_STATUS register twice; nothing is read back. */ +static void +lapic_esr_clear(void) +{ + LAPIC_WRITE(ERROR_STATUS, 0); + LAPIC_WRITE(ERROR_STATUS, 0); +} +/* Printable names looked up by the DM() macro in lapic_dump(), indexed by the value DM() extracts from an LVT entry. */ +static const char *DM_str[8] = { + "Fixed", + "Lowest Priority", + "Invalid", + "Invalid", + "NMI", + "Reset", + "Invalid", + "ExtINT"}; +/* + * Print the local APIC state with kprintf: id, version, priorities, + * destination registers, spurious vector register, each LVT entry, + * the error status and the TMR/IRR/ISR registers. + * The ESR is read through lapic_esr_read(), which writes ERROR_STATUS + * first, so a dump is not purely read-only. + */ +void +lapic_dump(void) +{ + int i; + /* Formatting helpers for the kprintf calls below. BOOL() yields a space for non-zero and an exclamation mark for zero, so a clear bit prints with a leading exclamation mark (for example !masked). NOTE(review): VEC, DS, TM and IP expand to unparenthesized expressions and are only safe as complete call arguments, which is how they are used here. */ +#define BOOL(a) ((a)?' ':'!') +#define VEC(lvt) \ + LAPIC_READ(lvt)&LAPIC_LVT_VECTOR_MASK +#define DS(lvt) \ + (LAPIC_READ(lvt)&LAPIC_LVT_DS_PENDING)?" SendPending" : "Idle" +#define DM(lvt) \ + DM_str[(LAPIC_READ(lvt)>>LAPIC_LVT_DM_SHIFT)&LAPIC_LVT_DM_MASK] +#define MASK(lvt) \ + BOOL(LAPIC_READ(lvt)&LAPIC_LVT_MASKED) +#define TM(lvt) \ + (LAPIC_READ(lvt)&LAPIC_LVT_TM_LEVEL)? "Level" : "Edge" +#define IP(lvt) \ + (LAPIC_READ(lvt)&LAPIC_LVT_IP_PLRITY_LOW)? 
"Low " : "High" + + kprintf("LAPIC %d at 0x%x version 0x%x\n", + (LAPIC_READ(ID)>>LAPIC_ID_SHIFT)&LAPIC_ID_MASK, + lapic_start, + LAPIC_READ(VERSION)&LAPIC_VERSION_MASK); + kprintf("Priorities: Task 0x%x Arbitration 0x%x Processor 0x%x\n", + LAPIC_READ(TPR)&LAPIC_TPR_MASK, + LAPIC_READ(APR)&LAPIC_APR_MASK, + LAPIC_READ(PPR)&LAPIC_PPR_MASK); + kprintf("Destination Format 0x%x Logical Destination 0x%x\n", + LAPIC_READ(DFR)>>LAPIC_DFR_SHIFT, + LAPIC_READ(LDR)>>LAPIC_LDR_SHIFT); + kprintf("%cEnabled %cFocusChecking SV 0x%x\n", + BOOL(LAPIC_READ(SVR)&LAPIC_SVR_ENABLE), + BOOL(!(LAPIC_READ(SVR)&LAPIC_SVR_FOCUS_OFF)), + LAPIC_READ(SVR) & LAPIC_SVR_MASK); + kprintf("LVT_TIMER: Vector 0x%02x %s %cmasked %s\n", + VEC(LVT_TIMER), + DS(LVT_TIMER), + MASK(LVT_TIMER), + (LAPIC_READ(LVT_TIMER)&LAPIC_LVT_PERIODIC)?"Periodic":"OneShot"); + kprintf(" Initial Count: 0x%08x \n", LAPIC_READ(TIMER_INITIAL_COUNT)); + kprintf(" Current Count: 0x%08x \n", LAPIC_READ(TIMER_CURRENT_COUNT)); + kprintf(" Divide Config: 0x%08x \n", LAPIC_READ(TIMER_DIVIDE_CONFIG)); + kprintf("LVT_PERFCNT: Vector 0x%02x [%s] %s %cmasked\n", + VEC(LVT_PERFCNT), + DM(LVT_PERFCNT), + DS(LVT_PERFCNT), + MASK(LVT_PERFCNT)); + kprintf("LVT_THERMAL: Vector 0x%02x [%s] %s %cmasked\n", + VEC(LVT_THERMAL), + DM(LVT_THERMAL), + DS(LVT_THERMAL), + MASK(LVT_THERMAL)); + kprintf("LVT_LINT0: Vector 0x%02x [%s][%s][%s] %s %cmasked\n", + VEC(LVT_LINT0), + DM(LVT_LINT0), + TM(LVT_LINT0), + IP(LVT_LINT0), + DS(LVT_LINT0), + MASK(LVT_LINT0)); + kprintf("LVT_LINT1: Vector 0x%02x [%s][%s][%s] %s %cmasked\n", + VEC(LVT_LINT1), + DM(LVT_LINT1), + TM(LVT_LINT1), + IP(LVT_LINT1), + DS(LVT_LINT1), + MASK(LVT_LINT1)); + kprintf("LVT_ERROR: Vector 0x%02x %s %cmasked\n", + VEC(LVT_ERROR), + DS(LVT_ERROR), + MASK(LVT_ERROR)); + kprintf("ESR: %08x \n", lapic_esr_read()); + /* Column ruler: each hex digit from f down to 0 is printed four times. */ kprintf(" "); + for(i=0xf; i>=0; i--) + kprintf("%x%x%x%x",i,i,i,i); + kprintf("\n"); + /* TMR, IRR and ISR are each printed as eight 8-digit hex words, index 7 first, read at i*0x10 from the register base. */ kprintf("TMR: 0x"); + for(i=7; i>=0; i--) + 
kprintf("%08x",LAPIC_READ_OFFSET(TMR_BASE, i*0x10)); + kprintf("\n"); + kprintf("IRR: 0x"); + for(i=7; i>=0; i--) + kprintf("%08x",LAPIC_READ_OFFSET(IRR_BASE, i*0x10)); + kprintf("\n"); + kprintf("ISR: 0x"); + for(i=7; i >= 0; i--) + kprintf("%08x",LAPIC_READ_OFFSET(ISR_BASE, i*0x10)); + kprintf("\n"); +} + +#if MACH_KDB +/* + * Built only under MACH_KDB: displays the local APIC state by calling + * lapic_dump(). All four arguments are unused. + * NOTE(review): the prototype in db_machdep.h declares have_addr as + * boolean_t while this definition uses int -- confirm the two are compatible. + * + * da + */ +void +db_apic(__unused db_expr_t addr, + __unused int have_addr, + __unused db_expr_t count, + __unused char *modif) +{ + + lapic_dump(); + + return; +} + +#endif + +boolean_t +lapic_probe(void) +{ + uint32_t lo; + uint32_t hi; + + if (cpuid_features() & CPUID_FEATURE_APIC) + return TRUE; + + if (cpuid_family() == 6 || cpuid_family() == 15) { + /* + * Mobile Pentiums: + * There may be a local APIC which wasn't enabled by BIOS. + * So we try to enable it explicitly. + */ + rdmsr(MSR_IA32_APIC_BASE, lo, hi); + lo &= ~MSR_IA32_APIC_BASE_BASE; + lo |= MSR_IA32_APIC_BASE_ENABLE | LAPIC_START; + lo |= MSR_IA32_APIC_BASE_ENABLE; + wrmsr(MSR_IA32_APIC_BASE, lo, hi); + + /* + * Re-initialize cpu features info and re-check.
+ */ + cpuid_set_info(); + if (cpuid_features() & CPUID_FEATURE_APIC) { + printf("Local APIC discovered and enabled\n"); + lapic_os_enabled = TRUE; + lapic_interrupt_base = LAPIC_REDUCED_INTERRUPT_BASE; + return TRUE; + } + } + + return FALSE; +} + +void +lapic_shutdown(void) +{ + uint32_t lo; + uint32_t hi; + uint32_t value; + + /* Shutdown if local APIC was enabled by OS */ + if (lapic_os_enabled == FALSE) + return; + + mp_disable_preemption(); + + /* ExtINT: masked */ + if (get_cpu_number() == master_cpu) { + value = LAPIC_READ(LVT_LINT0); + value |= LAPIC_LVT_MASKED; + LAPIC_WRITE(LVT_LINT0, value); + } + + /* Timer: masked */ + LAPIC_WRITE(LVT_TIMER, LAPIC_READ(LVT_TIMER) | LAPIC_LVT_MASKED); + + /* Perfmon: masked */ + LAPIC_WRITE(LVT_PERFCNT, LAPIC_READ(LVT_PERFCNT) | LAPIC_LVT_MASKED); + + /* Error: masked */ + LAPIC_WRITE(LVT_ERROR, LAPIC_READ(LVT_ERROR) | LAPIC_LVT_MASKED); + + /* APIC software disabled */ + LAPIC_WRITE(SVR, LAPIC_READ(SVR) & ~LAPIC_SVR_ENABLE); + + /* Bypass the APIC completely and update cpu features */ + rdmsr(MSR_IA32_APIC_BASE, lo, hi); + lo &= ~MSR_IA32_APIC_BASE_ENABLE; + wrmsr(MSR_IA32_APIC_BASE, lo, hi); + cpuid_set_info(); + + mp_enable_preemption(); +} + +void +lapic_configure(void) +{ + int value; + + /* Set flat delivery model, logical processor id */ + LAPIC_WRITE(DFR, LAPIC_DFR_FLAT); + LAPIC_WRITE(LDR, (get_cpu_number()) << LAPIC_LDR_SHIFT); + + /* Accept all */ + LAPIC_WRITE(TPR, 0); + + LAPIC_WRITE(SVR, LAPIC_VECTOR(SPURIOUS) | LAPIC_SVR_ENABLE); + + /* ExtINT */ + if (get_cpu_number() == master_cpu) { + value = LAPIC_READ(LVT_LINT0); + value &= ~LAPIC_LVT_MASKED; + value |= LAPIC_LVT_DM_EXTINT; + LAPIC_WRITE(LVT_LINT0, value); + } + + /* Timer: unmasked, one-shot */ + LAPIC_WRITE(LVT_TIMER, LAPIC_VECTOR(TIMER)); + + /* Perfmon: unmasked */ + LAPIC_WRITE(LVT_PERFCNT, LAPIC_VECTOR(PERFCNT)); + + /* Thermal: unmasked */ + LAPIC_WRITE(LVT_THERMAL, LAPIC_VECTOR(THERMAL)); + + lapic_esr_clear(); + + LAPIC_WRITE(LVT_ERROR, 
LAPIC_VECTOR(ERROR));
+}
+
+/*
+ * Program the local APIC timer.
+ * The LVT timer entry's mask and periodic bits are rewritten from
+ * 'interrupt' and 'mode', then the divide configuration and initial
+ * count are written. Interrupts are disabled around the sequence and
+ * the previous interrupt state is restored afterwards.
+ */
+void
+lapic_set_timer(
+	boolean_t		interrupt,
+	lapic_timer_mode_t	mode,
+	lapic_timer_divide_t	divisor,
+	lapic_timer_count_t	initial_count)
+{
+	boolean_t	state;
+	uint32_t	timer_vector;
+
+	state = ml_set_interrupts_enabled(FALSE);
+	timer_vector = LAPIC_READ(LVT_TIMER);
+	timer_vector &= ~(LAPIC_LVT_MASKED|LAPIC_LVT_PERIODIC);
+	timer_vector |= interrupt ? 0 : LAPIC_LVT_MASKED;
+	timer_vector |= (mode == periodic) ? LAPIC_LVT_PERIODIC : 0;
+	LAPIC_WRITE(LVT_TIMER, timer_vector);
+	LAPIC_WRITE(TIMER_DIVIDE_CONFIG, divisor);
+	LAPIC_WRITE(TIMER_INITIAL_COUNT, initial_count);
+	ml_set_interrupts_enabled(state);
+}
+
+/*
+ * Read back the local APIC timer settings.
+ * Each output pointer is optional: a NULL pointer skips that value.
+ * Interrupts are disabled while the registers are read.
+ */
+void
+lapic_get_timer(
+	lapic_timer_mode_t	*mode,
+	lapic_timer_divide_t	*divisor,
+	lapic_timer_count_t	*initial_count,
+	lapic_timer_count_t	*current_count)
+{
+	boolean_t	state;
+
+	state = ml_set_interrupts_enabled(FALSE);
+	if (mode)
+		*mode = (LAPIC_READ(LVT_TIMER) & LAPIC_LVT_PERIODIC) ?
+				periodic : one_shot;
+	if (divisor)
+		*divisor = LAPIC_READ(TIMER_DIVIDE_CONFIG) & LAPIC_TIMER_DIVIDE_MASK;
+	if (initial_count)
+		*initial_count = LAPIC_READ(TIMER_INITIAL_COUNT);
+	if (current_count)
+		*current_count = LAPIC_READ(TIMER_CURRENT_COUNT);
+	ml_set_interrupts_enabled(state);
+}
+
+/* Signal end-of-interrupt by writing 0 to the local APIC EOI register. */
+static inline void
+_lapic_end_of_interrupt(void)
+{
+	LAPIC_WRITE(EOI, 0);
+}
+
+/* Exported wrapper around _lapic_end_of_interrupt(). */
+void
+lapic_end_of_interrupt(void)
+{
+	_lapic_end_of_interrupt();
+}
+
+/*
+ * Register 'func' as the handler for a local APIC interrupt source.
+ * 'vector' may be given either as an absolute vector (greater than
+ * lapic_interrupt_base, in which case the base is subtracted) or as an
+ * offset relative to the base. Only the sources listed in the switch
+ * are accepted; anything else panics.
+ */
+void
+lapic_set_intr_func(int vector, i386_intr_func_t func)
+{
+	if (vector > lapic_interrupt_base)
+		vector -= lapic_interrupt_base;
+
+	switch (vector) {
+	case LAPIC_NMI_INTERRUPT:
+	case LAPIC_INTERPROCESSOR_INTERRUPT:
+	case LAPIC_TIMER_INTERRUPT:
+	case LAPIC_THERMAL_INTERRUPT:
+	case LAPIC_PERFCNT_INTERRUPT:
+		lapic_intr_func[vector] = func;
+		break;
+	default:
+		panic("lapic_set_intr_func(%d,%p) invalid vector\n",
+			vector, func);
+	}
+}
+
+/*
+ * Dispatch an interrupt that arrived through the local APIC.
+ *
+ * 'interrupt' is the absolute vector; it is converted to an offset from
+ * lapic_interrupt_base before dispatch. Vectors below the base are only
+ * handled when they are the NMI vector and an NMI handler is registered,
+ * in which case the handler's return value is returned.
+ *
+ * Returns 1 when the interrupt was consumed here (timer, thermal,
+ * interprocessor, perfcnt, error, spurious), 0 when it was not.
+ */
+int
+lapic_interrupt(int interrupt, x86_saved_state_t *state)
+{
+	int	retval = 0;
+
+	interrupt -= lapic_interrupt_base;
+	if (interrupt < 0) {
+		if (interrupt == (LAPIC_NMI_INTERRUPT - lapic_interrupt_base) &&
+		    lapic_intr_func[LAPIC_NMI_INTERRUPT] != NULL) {
+			retval = (*lapic_intr_func[LAPIC_NMI_INTERRUPT])(state);
+			_lapic_end_of_interrupt();
+			return retval;
+		}
+		else
+			return 0;
+	}
+
+	switch(interrupt) {
+	/*
+	 * LAPIC_PERFCNT_INTERRUPT must be listed here: lapic_set_intr_func()
+	 * accepts a handler for it, and the LVT_PERFCNT rewrite below is
+	 * otherwise unreachable.
+	 */
+	case LAPIC_PERFCNT_INTERRUPT:
+	case LAPIC_TIMER_INTERRUPT:
+	case LAPIC_THERMAL_INTERRUPT:
+	case LAPIC_INTERPROCESSOR_INTERRUPT:
+		if (lapic_intr_func[interrupt] != NULL)
+			(void) (*lapic_intr_func[interrupt])(state);
+		/*
+		 * Rewrite the perfcnt LVT entry with the mask bit clear.
+		 * NOTE(review): presumably needed because hardware masks the
+		 * entry when a PMI is delivered -- confirm against the SDM.
+		 */
+		if (interrupt == LAPIC_PERFCNT_INTERRUPT)
+			LAPIC_WRITE(LVT_PERFCNT, LAPIC_VECTOR(PERFCNT));
+		_lapic_end_of_interrupt();
+		retval = 1;
+		break;
+	case LAPIC_ERROR_INTERRUPT:
+		lapic_dump();
+		panic("Local APIC error\n");
+		_lapic_end_of_interrupt();
+		retval = 1;
+		break;
+	case LAPIC_SPURIOUS_INTERRUPT:
+		kprintf("SPIV\n");
+		/* No EOI required here */
+		retval = 1;
+		break;
+	}
+
+	return retval;
+}
+
+/*
+ * Recover the local APIC timer after a trip through SMM.
+ * Does nothing unless the OS itself enabled the local APIC
+ * (lapic_os_enabled). Runs with interrupts disabled.
+ */
+void
+lapic_smm_restore(void)
+{
+	boolean_t state;
+
+	if (lapic_os_enabled == FALSE)
+		return;
+
+	state = ml_set_interrupts_enabled(FALSE);
+
+ 	if (LAPIC_ISR_IS_SET(LAPIC_REDUCED_INTERRUPT_BASE, TIMER)) {
+		/*
+		 * Bogus SMI handler enables interrupts but does not know about
+		 * local APIC interrupt sources. When APIC timer counts down to
+		 * zero while in SMM, local APIC will end up waiting for an EOI
+		 * but no interrupt was delivered to the OS.
+ 		 */
+		_lapic_end_of_interrupt();
+
+		/*
+		 * timer is one-shot, trigger another quick countdown to trigger
+		 * another timer interrupt.
+		 */
+		if (LAPIC_READ(TIMER_CURRENT_COUNT) == 0) {
+			LAPIC_WRITE(TIMER_INITIAL_COUNT, 1);
+		}
+
+		kprintf("lapic_smm_restore\n");
+	}
+
+	ml_set_interrupts_enabled(state);
+}
+
diff --git a/osfmk/i386/lapic.h b/osfmk/i386/lapic.h
new file mode 100644
index 000000000..4fa855676
--- /dev/null
+++ b/osfmk/i386/lapic.h
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2008 Apple Inc. All rights reserved.
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * @OSF_COPYRIGHT@ + * + */ +#ifndef _I386_LAPIC_H_ +#define _I386_LAPIC_H_ + +#define LAPIC_START 0xFEE00000 +#define LAPIC_SIZE 0x00000400 + +#define LAPIC_ID 0x00000020 +#define LAPIC_ID_SHIFT 24 +#define LAPIC_ID_MASK 0xFF +#define LAPIC_VERSION 0x00000030 +#define LAPIC_VERSION_MASK 0xFF +#define LAPIC_TPR 0x00000080 +#define LAPIC_TPR_MASK 0xFF +#define LAPIC_APR 0x00000090 +#define LAPIC_APR_MASK 0xFF +#define LAPIC_PPR 0x000000A0 +#define LAPIC_PPR_MASK 0xFF +#define LAPIC_EOI 0x000000B0 +#define LAPIC_REMOTE_READ 0x000000C0 +#define LAPIC_LDR 0x000000D0 +#define LAPIC_LDR_SHIFT 24 +#define LAPIC_DFR 0x000000E0 +#define LAPIC_DFR_FLAT 0xFFFFFFFF +#define LAPIC_DFR_CLUSTER 0x0FFFFFFF +#define LAPIC_DFR_SHIFT 28 +#define LAPIC_SVR 0x000000F0 +#define LAPIC_SVR_MASK 0x0FF +#define LAPIC_SVR_ENABLE 0x100 +#define LAPIC_SVR_FOCUS_OFF 0x200 +#define LAPIC_ISR_BASE 0x00000100 +#define LAPIC_TMR_BASE 0x00000180 +#define LAPIC_IRR_BASE 0x00000200 +#define LAPIC_ERROR_STATUS 0x00000280 +#define LAPIC_ICR 0x00000300 +#define LAPIC_ICR_VECTOR_MASK 0x000FF +#define LAPIC_ICR_DM_MASK 0x00700 +#define LAPIC_ICR_DM_FIXED 0x00000 +#define LAPIC_ICR_DM_LOWEST 0x00100 +#define LAPIC_ICR_DM_SMI 0x00200 +#define LAPIC_ICR_DM_REMOTE 0x00300 +#define LAPIC_ICR_DM_NMI 0x00400 +#define LAPIC_ICR_DM_INIT 0x00500 +#define LAPIC_ICR_DM_STARTUP 0x00600 +#define LAPIC_ICR_DM_LOGICAL 0x00800 +#define LAPIC_ICR_DS_PENDING 0x01000 +#define LAPIC_ICR_LEVEL_ASSERT 0x04000 +#define LAPIC_ICR_TRIGGER_LEVEL 0x08000 +#define LAPIC_ICR_RR_MASK 0x30000 +#define LAPIC_ICR_RR_INVALID 0x00000 +#define LAPIC_ICR_RR_INPROGRESS 0x10000 +#define LAPIC_ICR_RR_VALID 0x20000 +#define LAPIC_ICR_DSS_MASK 0xC0000 +#define LAPIC_ICR_DSS_DEST 0x00000 +#define LAPIC_ICR_DSS_SELF 0x40000 +#define LAPIC_ICR_DSS_ALL 0x80000 +#define LAPIC_ICR_DSS_OTHERS 0xC0000 +#define LAPIC_ICRD 0x00000310 +#define LAPIC_ICRD_DEST_SHIFT 24 +#define LAPIC_LVT_TIMER 0x00000320 
+#define LAPIC_LVT_THERMAL 0x00000330 +#define LAPIC_LVT_PERFCNT 0x00000340 +#define LAPIC_LVT_LINT0 0x00000350 +#define LAPIC_LVT_LINT1 0x00000360 +#define LAPIC_LVT_ERROR 0x00000370 +#define LAPIC_LVT_VECTOR_MASK 0x000FF +#define LAPIC_LVT_DM_SHIFT 8 +#define LAPIC_LVT_DM_MASK 0x00007 +#define LAPIC_LVT_DM_FIXED 0x00000 +#define LAPIC_LVT_DM_NMI 0x00400 +#define LAPIC_LVT_DM_EXTINT 0x00700 +#define LAPIC_LVT_DS_PENDING 0x01000 +#define LAPIC_LVT_IP_PLRITY_LOW 0x02000 +#define LAPIC_LVT_REMOTE_IRR 0x04000 +#define LAPIC_LVT_TM_LEVEL 0x08000 +#define LAPIC_LVT_MASKED 0x10000 +#define LAPIC_LVT_PERIODIC 0x20000 +#define LAPIC_TIMER_INITIAL_COUNT 0x00000380 +#define LAPIC_TIMER_CURRENT_COUNT 0x00000390 +#define LAPIC_TIMER_DIVIDE_CONFIG 0x000003E0 +/* divisor encoded by bits 0,1,3 with bit 2 always 0: */ +#define LAPIC_TIMER_DIVIDE_MASK 0x0000000F +#define LAPIC_TIMER_DIVIDE_2 0x00000000 +#define LAPIC_TIMER_DIVIDE_4 0x00000001 +#define LAPIC_TIMER_DIVIDE_8 0x00000002 +#define LAPIC_TIMER_DIVIDE_16 0x00000003 +#define LAPIC_TIMER_DIVIDE_32 0x00000008 +#define LAPIC_TIMER_DIVIDE_64 0x00000009 +#define LAPIC_TIMER_DIVIDE_128 0x0000000A +#define LAPIC_TIMER_DIVIDE_1 0x0000000B + +#define LAPIC_ID_MAX (LAPIC_ID_MASK) + +#define CPU_NUMBER(r) \ + movl %gs:CPU_NUMBER_GS,r + +#define CPU_NUMBER_FROM_LAPIC(r) \ + movl EXT(lapic_id),r; \ + movl 0(r),r; \ + shrl $(LAPIC_ID_SHIFT),r; \ + andl $(LAPIC_ID_MASK),r; \ + movl EXT(lapic_to_cpu)(,r,4),r + +#ifndef ASSEMBLER +#include +#include +#include +#include +typedef enum { + periodic, + one_shot +} lapic_timer_mode_t; +typedef enum { + divide_by_1 = LAPIC_TIMER_DIVIDE_1, + divide_by_2 = LAPIC_TIMER_DIVIDE_2, + divide_by_4 = LAPIC_TIMER_DIVIDE_4, + divide_by_8 = LAPIC_TIMER_DIVIDE_8, + divide_by_16 = LAPIC_TIMER_DIVIDE_16, + divide_by_32 = LAPIC_TIMER_DIVIDE_32, + divide_by_64 = LAPIC_TIMER_DIVIDE_64, + divide_by_128 = LAPIC_TIMER_DIVIDE_128 +} lapic_timer_divide_t; +typedef uint32_t lapic_timer_count_t; + +/* + * By default, use 
high vectors to leave vector space for systems
+ * with multiple I/O APIC's. However some systems that boot with
+ * local APIC disabled will hang in SMM when vectors greater than
+ * 0x5F are used. Those systems are not expected to have I/O APIC
+ * so 16 (0x50 - 0x40) vectors for legacy PIC support is perfect.
+ */
+#define LAPIC_DEFAULT_INTERRUPT_BASE	0xD0
+#define LAPIC_REDUCED_INTERRUPT_BASE	0x50
+/*
+ * Specific lapic interrupts are relative to this base
+ * in priority order from high to low:
+ */
+
+#define LAPIC_PERFCNT_INTERRUPT		0xF
+#define LAPIC_TIMER_INTERRUPT		0xE
+#define LAPIC_INTERPROCESSOR_INTERRUPT	0xD
+#define LAPIC_THERMAL_INTERRUPT		0xC
+#define LAPIC_ERROR_INTERRUPT		0xB
+#define LAPIC_SPURIOUS_INTERRUPT	0xA
+#define LAPIC_CMCI_INTERRUPT		0x9
+/* The vector field is ignored for NMI interrupts via the LAPIC
+ * or otherwise, so this is not an offset from the interrupt
+ * base.
+ */
+#define LAPIC_NMI_INTERRUPT		0x2
+#define LAPIC_FUNC_TABLE_SIZE		(LAPIC_PERFCNT_INTERRUPT + 1)	/* handlers are indexed 0..LAPIC_PERFCNT_INTERRUPT inclusive */
+
+#define LAPIC_WRITE(reg,val) \
+	(*((volatile uint32_t *)(lapic_start + LAPIC_##reg)) = (val))
+#define LAPIC_READ(reg) \
+	(*((volatile uint32_t *)(lapic_start + LAPIC_##reg)))
+#define LAPIC_READ_OFFSET(reg,off) \
+	(*((volatile uint32_t *)(lapic_start + LAPIC_##reg + (off))))
+
+#define LAPIC_VECTOR(src) \
+	(lapic_interrupt_base + LAPIC_##src##_INTERRUPT)
+
+#define LAPIC_ISR_IS_SET(base,src) \
+	(LAPIC_READ_OFFSET(ISR_BASE,(((base)+LAPIC_##src##_INTERRUPT)/32)*0x10) \
+		& (1U << (((base) + LAPIC_##src##_INTERRUPT)%32)))
+
+extern vm_offset_t	lapic_start;
+
+extern void		lapic_init(void);
+extern void		lapic_configure(void);
+extern void		lapic_shutdown(void);
+extern void		lapic_smm_restore(void);
+extern boolean_t	lapic_probe(void);
+extern void		lapic_dump(void);
+extern int		lapic_interrupt(
+				int interrupt, x86_saved_state_t *state);
+extern void		lapic_end_of_interrupt(void);
+extern int		lapic_to_cpu[];
+extern int		cpu_to_lapic[];
+extern int		lapic_interrupt_base;
+extern void
lapic_cpu_map(int lapic, int cpu_num); +extern uint32_t ml_get_apicid(uint32_t cpu); + +extern void lapic_set_timer( + boolean_t interrupt, + lapic_timer_mode_t mode, + lapic_timer_divide_t divisor, + lapic_timer_count_t initial_count); + +extern void lapic_get_timer( + lapic_timer_mode_t *mode, + lapic_timer_divide_t *divisor, + lapic_timer_count_t *initial_count, + lapic_timer_count_t *current_count); + +typedef int (*i386_intr_func_t)(x86_saved_state_t *state); +extern void lapic_set_intr_func(int intr, i386_intr_func_t func); + +static inline void lapic_set_timer_func(i386_intr_func_t func) +{ + lapic_set_intr_func(LAPIC_VECTOR(TIMER), func); +} +static inline void lapic_set_pmi_func(i386_intr_func_t func) +{ + lapic_set_intr_func(LAPIC_VECTOR(PERFCNT), func); +} +static inline void lapic_set_thermal_func(i386_intr_func_t func) +{ + lapic_set_intr_func(LAPIC_VECTOR(THERMAL), func); +} + +#ifdef MP_DEBUG +#define LAPIC_CPU_MAP_DUMP() lapic_cpu_map_dump() +#define LAPIC_DUMP() lapic_dump() +#else +#define LAPIC_CPU_MAP_DUMP() +#define LAPIC_DUMP() +#endif /* MP_DEBUG */ + +#endif /* ASSEMBLER */ + +#endif /* _I386_LAPIC_H_ */ + diff --git a/osfmk/i386/locore.s b/osfmk/i386/locore.s index 9fa0eb36f..5b57ee4cc 100644 --- a/osfmk/i386/locore.s +++ b/osfmk/i386/locore.s @@ -66,6 +66,8 @@ #include #include #include +#include +#include #include #include #include @@ -232,34 +234,12 @@ Entry(timer_grab) * Nanotime returned in %edx:%eax. * Computed from tsc based on the scale factor * and an implicit 32 bit shift. - * This code must match what _rtc_nanotime_read does in - * i386/machine_routines_asm.s. Failure to do so can - * result in "weird" timing results. * * Uses %eax, %ebx, %ecx, %edx, %esi, %edi. */ -#define RNT_INFO _rtc_nanotime_info #define NANOTIME \ - lea RNT_INFO,%edi ; \ -0: ; \ - movl RNT_GENERATION(%edi),%esi /* being updated? 
*/ ; \ - testl %esi,%esi ; \ - jz 0b /* wait until done */ ; \ - rdtsc ; \ - subl RNT_TSC_BASE(%edi),%eax ; \ - sbbl RNT_TSC_BASE+4(%edi),%edx /* tsc - tsc_base */ ; \ - movl RNT_SCALE(%edi),%ecx /* * scale factor */ ; \ - movl %edx,%ebx ; \ - mull %ecx ; \ - movl %ebx,%eax ; \ - movl %edx,%ebx ; \ - mull %ecx ; \ - addl %ebx,%eax ; \ - adcl $0,%edx ; \ - addl RNT_NS_BASE(%edi),%eax /* + ns_base */ ; \ - adcl RNT_NS_BASE+4(%edi),%edx ; \ - cmpl RNT_GENERATION(%edi),%esi /* check for update */ ; \ - jne 0b /* do it all again */ + mov %gs:CPU_NANOTIME,%edi ; \ + RTC_NANOTIME_READ_FAST() /* diff --git a/osfmk/i386/machine_check.c b/osfmk/i386/machine_check.c index 79adff827..23f26fc50 100644 --- a/osfmk/i386/machine_check.c +++ b/osfmk/i386/machine_check.c @@ -27,8 +27,12 @@ */ #include +#include #include #include +#include +#include +#include #include #include @@ -44,8 +48,6 @@ static boolean_t mca_threshold_status_present = FALSE; static boolean_t mca_extended_MSRs_present = FALSE; static unsigned int mca_extended_MSRs_count = 0; static ia32_mcg_cap_t ia32_mcg_cap; -static boolean_t mca_exception_taken = FALSE; - decl_simple_lock_data(static, mca_lock); typedef struct { @@ -61,6 +63,13 @@ typedef struct mca_state { mca_mci_bank_t mca_error_bank[0]; } mca_state_t; +typedef enum { + CLEAR, + DUMPING, + DUMPED +} mca_dump_state_t; +static volatile mca_dump_state_t mca_dump_state = CLEAR; + static void mca_get_availability(void) { @@ -161,15 +170,13 @@ mca_cpu_alloc(cpu_data_t *cdp) } static void -mca_save_state(void) +mca_save_state(mca_state_t *mca_state) { - mca_state_t *mca_state; mca_mci_bank_t *bank; unsigned int i; assert(!ml_get_interrupts_enabled() || get_preemption_level() > 0); - mca_state = (mca_state_t *) current_cpu_datap()->cpu_mca_state; if (mca_state == NULL) return; @@ -193,8 +200,8 @@ mca_save_state(void) void mca_check_save(void) { - if (mca_exception_taken) - mca_save_state(); + if (mca_dump_state > CLEAR) + 
mca_save_state(current_cpu_datap()->cpu_mca_state); } static void mca_dump_64bit_state(void) @@ -250,15 +257,14 @@ mca_report_cpu_info(void) // microcode revision is top 32 bits of MSR_IA32_UCODE_REV microcode = rdmsr64(MSR_IA32_UCODE_REV) >> 32; - kdb_printf("family: %d model: %d stepping: %d microcode revision %d\n", + kdb_printf(" family: %d model: %d stepping: %d microcode: %d\n", infop->cpuid_family, infop->cpuid_model, infop->cpuid_stepping, (uint32_t) microcode); - kdb_printf("%s\n", infop->cpuid_brand_string); + kdb_printf(" %s\n", infop->cpuid_brand_string); } - static const char *mca_threshold_status[] = { [THRESHOLD_STATUS_NO_TRACKING] "No tracking", [THRESHOLD_STATUS_GREEN] "Green", @@ -267,56 +273,65 @@ static const char *mca_threshold_status[] = { }; static void -mca_dump_error_banks(void) +mca_dump_bank(mca_state_t *state, int i) { - unsigned int i; + mca_mci_bank_t *bank; ia32_mci_status_t status; - kdb_printf("MCA error-reporting registers:\n"); - for (i = 0; i < mca_error_bank_count; i++ ) { - status.u64 = rdmsr64(IA32_MCi_STATUS(i)); + bank = &state->mca_error_bank[i]; + status = bank->mca_mci_status; + kdb_printf( + " IA32_MC%d_STATUS(0x%x): 0x%016qx %svalid\n", + i, IA32_MCi_STATUS(i), status.u64, IF(!status.bits.val, "in")); + if (!status.bits.val) + return; + + kdb_printf( + " MCA error code: 0x%04x\n", + status.bits.mca_error); + kdb_printf( + " Model specific error code: 0x%04x\n", + status.bits.model_specific_error); + if (!mca_threshold_status_present) { kdb_printf( - " IA32_MC%d_STATUS(0x%x): 0x%016qx %svalid\n", - i, IA32_MCi_STATUS(i), status.u64, - IF(!status.bits.val, "in")); - if (!status.bits.val) - continue; + " Other information: 0x%08x\n", + status.bits.other_information); + } else { + int threshold = status.bits_tes_p.threshold; kdb_printf( - " MCA error code : 0x%04x\n", - status.bits.mca_error); + " Other information: 0x%08x\n" + " Threshold-based status: %s\n", + status.bits_tes_p.other_information, + (status.bits_tes_p.uc 
== 0) ? + mca_threshold_status[threshold] : + "Undefined"); + } + kdb_printf( + " Status bits:\n%s%s%s%s%s%s", + IF(status.bits.pcc, " Processor context corrupt\n"), + IF(status.bits.addrv, " ADDR register valid\n"), + IF(status.bits.miscv, " MISC register valid\n"), + IF(status.bits.en, " Error enabled\n"), + IF(status.bits.uc, " Uncorrected error\n"), + IF(status.bits.over, " Error overflow\n")); + if (status.bits.addrv) kdb_printf( - " Model specific error code: 0x%04x\n", - status.bits.model_specific_error); - if (!mca_threshold_status_present) { - kdb_printf( - " Other information : 0x%08x\n", - status.bits.other_information); - } else { - int threshold = status.bits_tes_p.threshold; - kdb_printf( - " Other information : 0x%08x\n" - " Threshold-based status : %s\n", - status.bits_tes_p.other_information, - (status.bits_tes_p.uc == 0) ? - mca_threshold_status[threshold] : - "Undefined"); - } + " IA32_MC%d_ADDR(0x%x): 0x%016qx\n", + i, IA32_MCi_ADDR(i), bank->mca_mci_addr); + if (status.bits.miscv) kdb_printf( - " Status bits:\n%s%s%s%s%s%s", - IF(status.bits.pcc, " Processor context corrupt\n"), - IF(status.bits.addrv, " ADDR register valid\n"), - IF(status.bits.miscv, " MISC register valid\n"), - IF(status.bits.en, " Error enabled\n"), - IF(status.bits.uc, " Uncorrected error\n"), - IF(status.bits.over, " Error overflow\n")); - if (status.bits.addrv) - kdb_printf( - " IA32_MC%d_ADDR(0x%x): 0x%016qx\n", - i, IA32_MCi_ADDR(i), rdmsr64(IA32_MCi_ADDR(i))); - if (status.bits.miscv) - kdb_printf( - " IA32_MC%d_MISC(0x%x): 0x%016qx\n", - i, IA32_MCi_MISC(i), rdmsr64(IA32_MCi_MISC(i))); + " IA32_MC%d_MISC(0x%x): 0x%016qx\n", + i, IA32_MCi_MISC(i), bank->mca_mci_misc); +} + +static void +mca_dump_error_banks(mca_state_t *state) +{ + unsigned int i; + + kdb_printf("MCA error-reporting registers:\n"); + for (i = 0; i < mca_error_bank_count; i++ ) { + mca_dump_bank(state, i); } } @@ -324,19 +339,27 @@ void mca_dump(void) { ia32_mcg_status_t status; + mca_state_t 
*mca_state = current_cpu_datap()->cpu_mca_state; - mca_save_state(); + /* + * Capture local MCA registers to per-cpu data. + */ + mca_save_state(mca_state); /* * Serialize in case of multiple simultaneous machine-checks. - * Only the first caller is allowed to print MCA registers. + * Only the first caller is allowed to dump MCA registers, + * other threads spin meantime. */ simple_lock(&mca_lock); - if (mca_exception_taken) { + if (mca_dump_state > CLEAR) { simple_unlock(&mca_lock); + while (mca_dump_state == DUMPING) + cpu_pause(); return; } - mca_exception_taken = TRUE; + mca_dump_state = DUMPING; + simple_unlock(&mca_lock); /* * Report machine-check capabilities: @@ -348,11 +371,12 @@ mca_dump(void) mca_report_cpu_info(); kdb_printf( - " %d error-reporting banks\n%s%s", mca_error_bank_count, + " %d error-reporting banks\n%s%s%s", mca_error_bank_count, IF(mca_control_MSR_present, " control MSR present\n"), IF(mca_threshold_status_present, - " threshold-based error status present\n")); + " threshold-based error status present\n"), + ""); if (mca_extended_MSRs_present) kdb_printf( " %d extended MSRs present\n", mca_extended_MSRs_count); @@ -362,7 +386,7 @@ mca_dump(void) */ status.u64 = rdmsr64(IA32_MCG_STATUS); kdb_printf( - "Machine-check status 0x%016qx\n%s%s%s", status.u64, + "Machine-check status 0x%016qx:\n%s%s%s", status.u64, IF(status.bits.ripv, " restart IP valid\n"), IF(status.bits.eipv, " error IP valid\n"), IF(status.bits.mcip, " machine-check in progress\n")); @@ -370,7 +394,7 @@ mca_dump(void) /* * Dump error-reporting registers: */ - mca_dump_error_banks(); + mca_dump_error_banks(mca_state); /* * Dump any extended machine state: @@ -382,5 +406,6 @@ mca_dump(void) mca_dump_32bit_state(); } - simple_unlock(&mca_lock); + /* Update state to release any other threads. 
*/ + mca_dump_state = DUMPED; } diff --git a/osfmk/i386/machine_check.h b/osfmk/i386/machine_check.h index 853fe36f4..233e78e2c 100644 --- a/osfmk/i386/machine_check.h +++ b/osfmk/i386/machine_check.h @@ -49,11 +49,10 @@ typedef union { uint64_t count :BITS(7,0); uint64_t mcg_ctl_p :BIT1(8); uint64_t mcg_ext_p :BIT1(9); - uint64_t reserved1 :BIT1(10); + uint64_t mcg_reserved1 :BIT1(10); uint64_t mcg_tes_p :BIT1(11); - uint64_t reserved2 :BITS(15,12); + uint64_t mcg_reserved2 :BITS(15,12); uint64_t mcg_ext_cnt :BITS(23,16); - uint64_t reserved3 :BITS(63,24); } bits; uint64_t u64; } ia32_mcg_cap_t; @@ -64,7 +63,6 @@ typedef union { uint64_t ripv :BIT1(0); uint64_t eipv :BIT1(1); uint64_t mcip :BIT1(2); - uint64_t reserved :BITS(61,3); } bits; uint64_t u64; } ia32_mcg_status_t; @@ -113,7 +111,7 @@ typedef uint64_t ia32_mci_ctl_t; #define IA32_MCi_CTL_ENABLE_ALL (0xFFFFFFFFFFFFFFFFULL) typedef union { - struct { + struct { uint64_t mca_error :BITS(15,0); uint64_t model_specific_error :BITS(31,16); uint64_t other_information :BITS(56,32); @@ -124,13 +122,12 @@ typedef union { uint64_t uc :BIT1(61); uint64_t over :BIT1(62); uint64_t val :BIT1(63); - } bits; - struct { /* Variant if threshold-based error status present: */ + } bits; + struct { /* Variant if threshold-based error status present: */ uint64_t mca_error :BITS(15,0); uint64_t model_specific_error :BITS(31,16); uint64_t other_information :BITS(52,32); uint64_t threshold :BITS(54,53); - uint64_t reserved :BITS(56,55); uint64_t pcc :BIT1(57); uint64_t addrv :BIT1(58); uint64_t miscv :BIT1(59); @@ -138,8 +135,8 @@ typedef union { uint64_t uc :BIT1(61); uint64_t over :BIT1(62); uint64_t val :BIT1(63); - } bits_tes_p; - uint64_t u64; + } bits_tes_p; + uint64_t u64; } ia32_mci_status_t; /* Values for threshold_status if mcg_tes_p == 1 and uc == 0 */ @@ -151,7 +148,6 @@ typedef union { typedef uint64_t ia32_mci_addr_t; typedef uint64_t ia32_mci_misc_t; - #define IA32_MCG_EAX (0x180) #define IA32_MCG_EBX (0x181) 
#define IA32_MCG_ECX (0x182) @@ -189,10 +185,10 @@ typedef uint64_t ia32_mci_misc_t; #define IA32_MCG_R14 (0x196) #define IA32_MCG_R15 (0x197) -extern void mca_cpu_alloc(cpu_data_t *cdp); -extern void mca_cpu_init(void); -extern void mca_dump(void); -extern void mca_check_save(void); +extern void mca_cpu_alloc(cpu_data_t *cdp); +extern void mca_cpu_init(void); +extern void mca_dump(void); +extern void mca_check_save(void); #endif /* _I386_MACHINE_CHECK_H_ */ #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/i386/machine_cpu.h b/osfmk/i386/machine_cpu.h index b3143b07e..2460bf606 100644 --- a/osfmk/i386/machine_cpu.h +++ b/osfmk/i386/machine_cpu.h @@ -38,9 +38,6 @@ __BEGIN_DECLS void cpu_machine_init( void); -void cpu_signal_handler( - x86_saved_state_t *regs); - void handle_pending_TLB_flushes( void); diff --git a/osfmk/i386/machine_routines.c b/osfmk/i386/machine_routines.c index 4ffb4ddc3..d42f6d2f1 100644 --- a/osfmk/i386/machine_routines.c +++ b/osfmk/i386/machine_routines.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include @@ -292,39 +292,6 @@ void ml_install_interrupt_handler( } -void -machine_idle(void) -{ - x86_core_t *my_core = x86_core(); - cpu_data_t *my_cpu = current_cpu_datap(); - int others_active; - - /* - * We halt this cpu thread - * unless kernel param idlehalt is false and no other thread - * in the same core is active - if so, don't halt so that this - * core doesn't go into a low-power mode. - * For 4/4, we set a null "active cr3" while idle. 
- */ - if (my_core == NULL || my_cpu == NULL) - goto out; - - others_active = !atomic_decl_and_test( - (long *) &my_core->active_lcpus, 1); - my_cpu->lcpu.idle = TRUE; - if (idlehalt || others_active) { - DBGLOG(cpu_handle, cpu_number(), MP_IDLE); - MARK_CPU_IDLE(cpu_number()); - machine_idle_cstate(FALSE); - MARK_CPU_ACTIVE(cpu_number()); - DBGLOG(cpu_handle, cpu_number(), MP_UNIDLE); - } - my_cpu->lcpu.idle = FALSE; - atomic_incl((long *) &my_core->active_lcpus, 1); - out: - __asm__ volatile("sti"); -} - void machine_signal_idle( processor_t processor) @@ -376,7 +343,7 @@ ml_processor_register( goto failed; if (!boot_cpu) { - this_cpu_datap->lcpu.core = cpu_thread_alloc(this_cpu_datap->cpu_number); + cpu_thread_alloc(this_cpu_datap->cpu_number); if (this_cpu_datap->lcpu.core == NULL) goto failed; @@ -526,7 +493,7 @@ ml_init_lock_timeout(void) LockTimeOut = (uint32_t) abstime; LockTimeOutTSC = (uint32_t) tmrCvt(abstime, tscFCvtn2t); - if (PE_parse_boot_arg("mtxspin", &mtxspin)) { + if (PE_parse_boot_argn("mtxspin", &mtxspin, sizeof (mtxspin))) { if (mtxspin > USEC_PER_SEC>>4) mtxspin = USEC_PER_SEC>>4; nanoseconds_to_absolutetime(mtxspin*NSEC_PER_USEC, &abstime); diff --git a/osfmk/i386/machine_routines.h b/osfmk/i386/machine_routines.h index d746ed642..cf0af4761 100644 --- a/osfmk/i386/machine_routines.h +++ b/osfmk/i386/machine_routines.h @@ -285,10 +285,10 @@ extern void ml_set_maxsnoop(uint32_t maxdelay); extern unsigned ml_get_maxsnoop(void); extern void ml_set_maxbusdelay(uint32_t mdelay); extern uint32_t ml_get_maxbusdelay(void); +extern void ml_set_maxintdelay(uint64_t mdelay); +extern uint64_t ml_get_maxintdelay(void); -extern void ml_hpet_cfg(uint32_t cpu, uint32_t hpetVect); - extern uint64_t tmrCvt(uint64_t time, uint64_t conversion); extern uint64_t ml_cpu_int_event_time(void); diff --git a/osfmk/i386/machine_routines_asm.s b/osfmk/i386/machine_routines_asm.s index b7187aecf..f68b81376 100644 --- a/osfmk/i386/machine_routines_asm.s +++ 
b/osfmk/i386/machine_routines_asm.s @@ -27,6 +27,7 @@ */ #include +#include #include #include @@ -47,6 +48,7 @@ ENTRY(ml_get_timebase) movl S_ARG0, %ecx rdtsc + lfence movl %edx, 0(%ecx) movl %eax, 4(%ecx) @@ -217,30 +219,7 @@ LEXT(_rtc_nanotime_read) jnz Lslow /* Processor whose TSC frequency is faster than SLOW_TSC_THRESHOLD */ -0: - movl RNT_GENERATION(%edi),%esi /* get generation (0 if being changed) */ - testl %esi,%esi /* if being changed, loop until stable */ - jz 0b - - rdtsc /* get TSC in %edx:%eax */ - subl RNT_TSC_BASE(%edi),%eax - sbbl RNT_TSC_BASE+4(%edi),%edx - - movl RNT_SCALE(%edi),%ecx - - movl %edx,%ebx - mull %ecx - movl %ebx,%eax - movl %edx,%ebx - mull %ecx - addl %ebx,%eax - adcl $0,%edx - - addl RNT_NS_BASE(%edi),%eax - adcl RNT_NS_BASE+4(%edi),%edx - - cmpl RNT_GENERATION(%edi),%esi /* have the parameters changed? */ - jne 0b /* yes, loop until stable */ + RTC_NANOTIME_READ_FAST() popl %ebx popl %edi diff --git a/osfmk/i386/mp.c b/osfmk/i386/mp.c index 23ce61860..507399783 100644 --- a/osfmk/i386/mp.c +++ b/osfmk/i386/mp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -55,7 +56,7 @@ #include #include #include -#include +#include #include #include #include @@ -71,7 +72,6 @@ #include #include #include -#include #include #include @@ -97,20 +97,8 @@ #define PAUSE #endif /* MP_DEBUG */ -/* Initialize lapic_id so cpu_number() works on non SMP systems */ -unsigned long lapic_id_initdata = 0; -unsigned long lapic_id = (unsigned long)&lapic_id_initdata; -vm_offset_t lapic_start; - -static i386_intr_func_t lapic_timer_func; -static i386_intr_func_t lapic_pmi_func; -static i386_intr_func_t lapic_thermal_func; - -/* TRUE if local APIC was enabled by the OS not by the BIOS */ -static boolean_t lapic_os_enabled = FALSE; - -/* Base vector for local APIC interrupt sources */ -int lapic_interrupt_base = LAPIC_DEFAULT_INTERRUPT_BASE; +#define FULL_SLAVE_INIT (NULL) +#define FAST_SLAVE_INIT ((void *)(uintptr_t)1) void slave_boot_init(void); @@ -124,8 +112,9 @@ static void mp_kdp_wait(boolean_t flush); static void mp_rendezvous_action(void); static void mp_broadcast_action(void); -static int NMIInterruptHandler(x86_saved_state_t *regs); static boolean_t cpu_signal_pending(int cpu, mp_event_t event); +static int cpu_signal_handler(x86_saved_state_t *regs); +static int NMIInterruptHandler(x86_saved_state_t *regs); boolean_t smp_initialized = FALSE; volatile boolean_t force_immediate_debugger_NMI = FALSE; @@ -150,77 +139,13 @@ static volatile long mp_rv_complete __attribute__((aligned(64))); /* Variables needed for MP broadcast. 
*/ static void (*mp_bc_action_func)(void *arg); static void *mp_bc_func_arg; -static int mp_bc_ncpus; +static int mp_bc_ncpus; static volatile long mp_bc_count; decl_mutex_data(static, mp_bc_lock); +static volatile int debugger_cpu = -1; static void mp_cpus_call_action(void); -int lapic_to_cpu[MAX_CPUS]; -int cpu_to_lapic[MAX_CPUS]; - -static void -lapic_cpu_map_init(void) -{ - int i; - - for (i = 0; i < MAX_CPUS; i++) { - lapic_to_cpu[i] = -1; - cpu_to_lapic[i] = -1; - } -} - -void -lapic_cpu_map(int apic_id, int cpu) -{ - cpu_to_lapic[cpu] = apic_id; - lapic_to_cpu[apic_id] = cpu; -} - -/* - * Retrieve the local apic ID a cpu. - * - * Returns the local apic ID for the given processor. - * If the processor does not exist or apic not configured, returns -1. - */ - -uint32_t -ml_get_apicid(uint32_t cpu) -{ - if(cpu >= (uint32_t)MAX_CPUS) - return 0xFFFFFFFF; /* Return -1 if cpu too big */ - - /* Return the apic ID (or -1 if not configured) */ - return (uint32_t)cpu_to_lapic[cpu]; - -} - -#ifdef MP_DEBUG -static void -lapic_cpu_map_dump(void) -{ - int i; - - for (i = 0; i < MAX_CPUS; i++) { - if (cpu_to_lapic[i] == -1) - continue; - kprintf("cpu_to_lapic[%d]: %d\n", - i, cpu_to_lapic[i]); - } - for (i = 0; i < MAX_CPUS; i++) { - if (lapic_to_cpu[i] == -1) - continue; - kprintf("lapic_to_cpu[%d]: %d\n", - i, lapic_to_cpu[i]); - } -} -#define LAPIC_CPU_MAP_DUMP() lapic_cpu_map_dump() -#define LAPIC_DUMP() lapic_dump() -#else -#define LAPIC_CPU_MAP_DUMP() -#define LAPIC_DUMP() -#endif /* MP_DEBUG */ - #if GPROF /* * Initialize dummy structs for profiling. 
These aren't used but @@ -243,14 +168,6 @@ struct profile_vars *_profile_vars_cpus[MAX_CPUS] = { &_profile_vars }; void smp_init(void) { - int result; - vm_map_entry_t entry; - uint32_t lo; - uint32_t hi; - boolean_t is_boot_processor; - boolean_t is_lapic_enabled; - vm_offset_t lapic_base; - simple_lock_init(&mp_kdp_lock, 0); simple_lock_init(&mp_rv_lock, 0); mutex_init(&mp_cpu_boot_lock, 0); @@ -261,48 +178,10 @@ smp_init(void) if (!lapic_probe()) return; - /* Examine the local APIC state */ - rdmsr(MSR_IA32_APIC_BASE, lo, hi); - is_boot_processor = (lo & MSR_IA32_APIC_BASE_BSP) != 0; - is_lapic_enabled = (lo & MSR_IA32_APIC_BASE_ENABLE) != 0; - lapic_base = (lo & MSR_IA32_APIC_BASE_BASE); - kprintf("MSR_IA32_APIC_BASE 0x%x %s %s\n", lapic_base, - is_lapic_enabled ? "enabled" : "disabled", - is_boot_processor ? "BSP" : "AP"); - if (!is_boot_processor || !is_lapic_enabled) - panic("Unexpected local APIC state\n"); - - /* Establish a map to the local apic */ - lapic_start = vm_map_min(kernel_map); - result = vm_map_find_space(kernel_map, - (vm_map_address_t *) &lapic_start, - round_page(LAPIC_SIZE), 0, - VM_MAKE_TAG(VM_MEMORY_IOKIT), &entry); - if (result != KERN_SUCCESS) { - panic("smp_init: vm_map_find_entry FAILED (err=%d)", result); - } - vm_map_unlock(kernel_map); -/* Map in the local APIC non-cacheable, as recommended by Intel - * in section 8.4.1 of the "System Programming Guide". 
- */ - pmap_enter(pmap_kernel(), - lapic_start, - (ppnum_t) i386_btop(lapic_base), - VM_PROT_READ|VM_PROT_WRITE, - VM_WIMG_IO, - TRUE); - lapic_id = (unsigned long)(lapic_start + LAPIC_ID); - - if ((LAPIC_REG(VERSION)&LAPIC_VERSION_MASK) != 0x14) { - printf("Local APIC version not 0x14 as expected\n"); - } - - /* Set up the lapic_id <-> cpu_number map and add this boot processor */ - lapic_cpu_map_init(); - lapic_cpu_map((LAPIC_REG(ID)>>LAPIC_ID_SHIFT)&LAPIC_ID_MASK, 0); - kprintf("Boot cpu local APIC id 0x%x\n", cpu_to_lapic[0]); - lapic_init(); + lapic_configure(); + lapic_set_intr_func(LAPIC_NMI_INTERRUPT, NMIInterruptHandler); + lapic_set_intr_func(LAPIC_VECTOR(INTERPROCESSOR), cpu_signal_handler); cpu_thread_init(); @@ -316,420 +195,17 @@ smp_init(void) return; } - -static int -lapic_esr_read(void) -{ - /* write-read register */ - LAPIC_REG(ERROR_STATUS) = 0; - return LAPIC_REG(ERROR_STATUS); -} - -static void -lapic_esr_clear(void) -{ - LAPIC_REG(ERROR_STATUS) = 0; - LAPIC_REG(ERROR_STATUS) = 0; -} - -static const char *DM[8] = { - "Fixed", - "Lowest Priority", - "Invalid", - "Invalid", - "NMI", - "Reset", - "Invalid", - "ExtINT"}; - -void -lapic_dump(void) -{ - int i; - -#define BOOL(a) ((a)?' 
':'!') - - kprintf("LAPIC %d at 0x%x version 0x%x\n", - (LAPIC_REG(ID)>>LAPIC_ID_SHIFT)&LAPIC_ID_MASK, - lapic_start, - LAPIC_REG(VERSION)&LAPIC_VERSION_MASK); - kprintf("Priorities: Task 0x%x Arbitration 0x%x Processor 0x%x\n", - LAPIC_REG(TPR)&LAPIC_TPR_MASK, - LAPIC_REG(APR)&LAPIC_APR_MASK, - LAPIC_REG(PPR)&LAPIC_PPR_MASK); - kprintf("Destination Format 0x%x Logical Destination 0x%x\n", - LAPIC_REG(DFR)>>LAPIC_DFR_SHIFT, - LAPIC_REG(LDR)>>LAPIC_LDR_SHIFT); - kprintf("%cEnabled %cFocusChecking SV 0x%x\n", - BOOL(LAPIC_REG(SVR)&LAPIC_SVR_ENABLE), - BOOL(!(LAPIC_REG(SVR)&LAPIC_SVR_FOCUS_OFF)), - LAPIC_REG(SVR) & LAPIC_SVR_MASK); - kprintf("LVT_TIMER: Vector 0x%02x %s %cmasked %s\n", - LAPIC_REG(LVT_TIMER)&LAPIC_LVT_VECTOR_MASK, - (LAPIC_REG(LVT_TIMER)&LAPIC_LVT_DS_PENDING)?"SendPending":"Idle", - BOOL(LAPIC_REG(LVT_TIMER)&LAPIC_LVT_MASKED), - (LAPIC_REG(LVT_TIMER)&LAPIC_LVT_PERIODIC)?"Periodic":"OneShot"); - kprintf(" Initial Count: 0x%08x \n", LAPIC_REG(TIMER_INITIAL_COUNT)); - kprintf(" Current Count: 0x%08x \n", LAPIC_REG(TIMER_CURRENT_COUNT)); - kprintf(" Divide Config: 0x%08x \n", LAPIC_REG(TIMER_DIVIDE_CONFIG)); - kprintf("LVT_PERFCNT: Vector 0x%02x [%s] %s %cmasked\n", - LAPIC_REG(LVT_PERFCNT)&LAPIC_LVT_VECTOR_MASK, - DM[(LAPIC_REG(LVT_PERFCNT)>>LAPIC_LVT_DM_SHIFT)&LAPIC_LVT_DM_MASK], - (LAPIC_REG(LVT_PERFCNT)&LAPIC_LVT_DS_PENDING)?"SendPending":"Idle", - BOOL(LAPIC_REG(LVT_PERFCNT)&LAPIC_LVT_MASKED)); - kprintf("LVT_THERMAL: Vector 0x%02x [%s] %s %cmasked\n", - LAPIC_REG(LVT_THERMAL)&LAPIC_LVT_VECTOR_MASK, - DM[(LAPIC_REG(LVT_THERMAL)>>LAPIC_LVT_DM_SHIFT)&LAPIC_LVT_DM_MASK], - (LAPIC_REG(LVT_THERMAL)&LAPIC_LVT_DS_PENDING)?"SendPending":"Idle", - BOOL(LAPIC_REG(LVT_THERMAL)&LAPIC_LVT_MASKED)); - kprintf("LVT_LINT0: Vector 0x%02x [%s][%s][%s] %s %cmasked\n", - LAPIC_REG(LVT_LINT0)&LAPIC_LVT_VECTOR_MASK, - DM[(LAPIC_REG(LVT_LINT0)>>LAPIC_LVT_DM_SHIFT)&LAPIC_LVT_DM_MASK], - (LAPIC_REG(LVT_LINT0)&LAPIC_LVT_TM_LEVEL)?"Level":"Edge ", - 
(LAPIC_REG(LVT_LINT0)&LAPIC_LVT_IP_PLRITY_LOW)?"Low ":"High", - (LAPIC_REG(LVT_LINT0)&LAPIC_LVT_DS_PENDING)?"SendPending":"Idle", - BOOL(LAPIC_REG(LVT_LINT0)&LAPIC_LVT_MASKED)); - kprintf("LVT_LINT1: Vector 0x%02x [%s][%s][%s] %s %cmasked\n", - LAPIC_REG(LVT_LINT1)&LAPIC_LVT_VECTOR_MASK, - DM[(LAPIC_REG(LVT_LINT1)>>LAPIC_LVT_DM_SHIFT)&LAPIC_LVT_DM_MASK], - (LAPIC_REG(LVT_LINT1)&LAPIC_LVT_TM_LEVEL)?"Level":"Edge ", - (LAPIC_REG(LVT_LINT1)&LAPIC_LVT_IP_PLRITY_LOW)?"Low ":"High", - (LAPIC_REG(LVT_LINT1)&LAPIC_LVT_DS_PENDING)?"SendPending":"Idle", - BOOL(LAPIC_REG(LVT_LINT1)&LAPIC_LVT_MASKED)); - kprintf("LVT_ERROR: Vector 0x%02x %s %cmasked\n", - LAPIC_REG(LVT_ERROR)&LAPIC_LVT_VECTOR_MASK, - (LAPIC_REG(LVT_ERROR)&LAPIC_LVT_DS_PENDING)?"SendPending":"Idle", - BOOL(LAPIC_REG(LVT_ERROR)&LAPIC_LVT_MASKED)); - kprintf("ESR: %08x \n", lapic_esr_read()); - kprintf(" "); - for(i=0xf; i>=0; i--) - kprintf("%x%x%x%x",i,i,i,i); - kprintf("\n"); - kprintf("TMR: 0x"); - for(i=7; i>=0; i--) - kprintf("%08x",LAPIC_REG_OFFSET(TMR_BASE, i*0x10)); - kprintf("\n"); - kprintf("IRR: 0x"); - for(i=7; i>=0; i--) - kprintf("%08x",LAPIC_REG_OFFSET(IRR_BASE, i*0x10)); - kprintf("\n"); - kprintf("ISR: 0x"); - for(i=7; i >= 0; i--) - kprintf("%08x",LAPIC_REG_OFFSET(ISR_BASE, i*0x10)); - kprintf("\n"); -} - -#if MACH_KDB /* - * Displays apic junk - * - * da + * Poll a CPU to see when it has marked itself as running. */ -void -db_apic(__unused db_expr_t addr, - __unused int have_addr, - __unused db_expr_t count, - __unused char *modif) -{ - - lapic_dump(); - - return; -} - -#endif - -boolean_t -lapic_probe(void) -{ - uint32_t lo; - uint32_t hi; - - if (cpuid_features() & CPUID_FEATURE_APIC) - return TRUE; - - if (cpuid_family() == 6 || cpuid_family() == 15) { - /* - * Mobile Pentiums: - * There may be a local APIC which wasn't enabled by BIOS. - * So we try to enable it explicitly. 
- */ - rdmsr(MSR_IA32_APIC_BASE, lo, hi); - lo &= ~MSR_IA32_APIC_BASE_BASE; - lo |= MSR_IA32_APIC_BASE_ENABLE | LAPIC_START; - lo |= MSR_IA32_APIC_BASE_ENABLE; - wrmsr(MSR_IA32_APIC_BASE, lo, hi); - - /* - * Re-initialize cpu features info and re-check. - */ - cpuid_set_info(); - if (cpuid_features() & CPUID_FEATURE_APIC) { - printf("Local APIC discovered and enabled\n"); - lapic_os_enabled = TRUE; - lapic_interrupt_base = LAPIC_REDUCED_INTERRUPT_BASE; - return TRUE; - } - } - - return FALSE; -} - -void -lapic_shutdown(void) -{ - uint32_t lo; - uint32_t hi; - uint32_t value; - - /* Shutdown if local APIC was enabled by OS */ - if (lapic_os_enabled == FALSE) - return; - - mp_disable_preemption(); - - /* ExtINT: masked */ - if (get_cpu_number() == master_cpu) { - value = LAPIC_REG(LVT_LINT0); - value |= LAPIC_LVT_MASKED; - LAPIC_REG(LVT_LINT0) = value; - } - - /* Timer: masked */ - LAPIC_REG(LVT_TIMER) |= LAPIC_LVT_MASKED; - - /* Perfmon: masked */ - LAPIC_REG(LVT_PERFCNT) |= LAPIC_LVT_MASKED; - - /* Error: masked */ - LAPIC_REG(LVT_ERROR) |= LAPIC_LVT_MASKED; - - /* APIC software disabled */ - LAPIC_REG(SVR) &= ~LAPIC_SVR_ENABLE; - - /* Bypass the APIC completely and update cpu features */ - rdmsr(MSR_IA32_APIC_BASE, lo, hi); - lo &= ~MSR_IA32_APIC_BASE_ENABLE; - wrmsr(MSR_IA32_APIC_BASE, lo, hi); - cpuid_set_info(); - - mp_enable_preemption(); -} - -void -lapic_init(void) -{ - int value; - - /* Set flat delivery model, logical processor id */ - LAPIC_REG(DFR) = LAPIC_DFR_FLAT; - LAPIC_REG(LDR) = (get_cpu_number()) << LAPIC_LDR_SHIFT; - - /* Accept all */ - LAPIC_REG(TPR) = 0; - - LAPIC_REG(SVR) = LAPIC_VECTOR(SPURIOUS) | LAPIC_SVR_ENABLE; - - /* ExtINT */ - if (get_cpu_number() == master_cpu) { - value = LAPIC_REG(LVT_LINT0); - value &= ~LAPIC_LVT_MASKED; - value |= LAPIC_LVT_DM_EXTINT; - LAPIC_REG(LVT_LINT0) = value; - } - - /* Timer: unmasked, one-shot */ - LAPIC_REG(LVT_TIMER) = LAPIC_VECTOR(TIMER); - - /* Perfmon: unmasked */ - LAPIC_REG(LVT_PERFCNT) = 
LAPIC_VECTOR(PERFCNT); - - /* Thermal: unmasked */ - LAPIC_REG(LVT_THERMAL) = LAPIC_VECTOR(THERMAL); - - lapic_esr_clear(); - - LAPIC_REG(LVT_ERROR) = LAPIC_VECTOR(ERROR); -} - -void -lapic_set_timer_func(i386_intr_func_t func) -{ - lapic_timer_func = func; -} - -void -lapic_set_timer( - boolean_t interrupt, - lapic_timer_mode_t mode, - lapic_timer_divide_t divisor, - lapic_timer_count_t initial_count) -{ - boolean_t state; - uint32_t timer_vector; - - state = ml_set_interrupts_enabled(FALSE); - timer_vector = LAPIC_REG(LVT_TIMER); - timer_vector &= ~(LAPIC_LVT_MASKED|LAPIC_LVT_PERIODIC);; - timer_vector |= interrupt ? 0 : LAPIC_LVT_MASKED; - timer_vector |= (mode == periodic) ? LAPIC_LVT_PERIODIC : 0; - LAPIC_REG(LVT_TIMER) = timer_vector; - LAPIC_REG(TIMER_DIVIDE_CONFIG) = divisor; - LAPIC_REG(TIMER_INITIAL_COUNT) = initial_count; - ml_set_interrupts_enabled(state); -} - -void -lapic_get_timer( - lapic_timer_mode_t *mode, - lapic_timer_divide_t *divisor, - lapic_timer_count_t *initial_count, - lapic_timer_count_t *current_count) -{ - boolean_t state; - - state = ml_set_interrupts_enabled(FALSE); - if (mode) - *mode = (LAPIC_REG(LVT_TIMER) & LAPIC_LVT_PERIODIC) ? - periodic : one_shot; - if (divisor) - *divisor = LAPIC_REG(TIMER_DIVIDE_CONFIG) & LAPIC_TIMER_DIVIDE_MASK; - if (initial_count) - *initial_count = LAPIC_REG(TIMER_INITIAL_COUNT); - if (current_count) - *current_count = LAPIC_REG(TIMER_CURRENT_COUNT); - ml_set_interrupts_enabled(state); -} - -void -lapic_set_pmi_func(i386_intr_func_t func) -{ - lapic_pmi_func = func; -} - -void -lapic_set_thermal_func(i386_intr_func_t func) -{ - lapic_thermal_func = func; -} - -static inline void -_lapic_end_of_interrupt(void) -{ - LAPIC_REG(EOI) = 0; -} - -void -lapic_end_of_interrupt(void) -{ - _lapic_end_of_interrupt(); -} - -int -lapic_interrupt(int interrupt, x86_saved_state_t *state) -{ - int retval = 0; - - /* Did we just field an interruption for the HPET comparator? 
*/ - if(x86_core()->HpetVec == ((uint32_t)interrupt - 0x40)) { - /* Yes, go handle it... */ - retval = HPETInterrupt(); - /* Was it really handled? */ - if(retval) { - /* If so, EOI the 'rupt */ - _lapic_end_of_interrupt(); - /* - * and then leave, - * indicating that this has been handled - */ - return 1; - } - } - - interrupt -= lapic_interrupt_base; - if (interrupt < 0) { - if (interrupt == (LAPIC_NMI_INTERRUPT - lapic_interrupt_base)) { - retval = NMIInterruptHandler(state); - _lapic_end_of_interrupt(); - return retval; - } - else - return 0; - } - - switch(interrupt) { - case LAPIC_PERFCNT_INTERRUPT: - if (lapic_pmi_func != NULL) - (*lapic_pmi_func)(NULL); - /* Clear interrupt masked */ - LAPIC_REG(LVT_PERFCNT) = LAPIC_VECTOR(PERFCNT); - _lapic_end_of_interrupt(); - retval = 1; - break; - case LAPIC_TIMER_INTERRUPT: - _lapic_end_of_interrupt(); - if (lapic_timer_func != NULL) - (*lapic_timer_func)(state); - retval = 1; - break; - case LAPIC_THERMAL_INTERRUPT: - if (lapic_thermal_func != NULL) - (*lapic_thermal_func)(NULL); - _lapic_end_of_interrupt(); - retval = 1; - break; - case LAPIC_ERROR_INTERRUPT: - lapic_dump(); - panic("Local APIC error\n"); - _lapic_end_of_interrupt(); - retval = 1; - break; - case LAPIC_SPURIOUS_INTERRUPT: - kprintf("SPIV\n"); - /* No EOI required here */ - retval = 1; - break; - case LAPIC_INTERPROCESSOR_INTERRUPT: - _lapic_end_of_interrupt(); - cpu_signal_handler(state); - retval = 1; - break; - } - - return retval; -} - -void -lapic_smm_restore(void) +static void +mp_wait_for_cpu_up(int slot_num, unsigned int iters, unsigned int usecdelay) { - boolean_t state; - - if (lapic_os_enabled == FALSE) - return; - - state = ml_set_interrupts_enabled(FALSE); - - if (LAPIC_ISR_IS_SET(LAPIC_REDUCED_INTERRUPT_BASE, TIMER)) { - /* - * Bogus SMI handler enables interrupts but does not know about - * local APIC interrupt sources. 
When APIC timer counts down to - * zero while in SMM, local APIC will end up waiting for an EOI - * but no interrupt was delivered to the OS. - */ - _lapic_end_of_interrupt(); - - /* - * timer is one-shot, trigger another quick countdown to trigger - * another timer interrupt. - */ - if (LAPIC_REG(TIMER_CURRENT_COUNT) == 0) { - LAPIC_REG(TIMER_INITIAL_COUNT) = 1; - } - - kprintf("lapic_smm_restore\n"); + while (iters-- > 0) { + if (cpu_datap(slot_num)->cpu_running) + break; + delay(usecdelay); } - - ml_set_interrupts_enabled(state); } kern_return_t @@ -766,27 +242,23 @@ intel_startCPU( return KERN_SUCCESS; } - LAPIC_REG(ICRD) = lapic << LAPIC_ICRD_DEST_SHIFT; - LAPIC_REG(ICR) = LAPIC_ICR_DM_INIT; + LAPIC_WRITE(ICRD, lapic << LAPIC_ICRD_DEST_SHIFT); + LAPIC_WRITE(ICR, LAPIC_ICR_DM_INIT); delay(10000); - LAPIC_REG(ICRD) = lapic << LAPIC_ICRD_DEST_SHIFT; - LAPIC_REG(ICR) = LAPIC_ICR_DM_STARTUP|(MP_BOOT>>12); + LAPIC_WRITE(ICRD, lapic << LAPIC_ICRD_DEST_SHIFT); + LAPIC_WRITE(ICR, LAPIC_ICR_DM_STARTUP|(MP_BOOT>>12)); delay(200); - LAPIC_REG(ICRD) = lapic << LAPIC_ICRD_DEST_SHIFT; - LAPIC_REG(ICR) = LAPIC_ICR_DM_STARTUP|(MP_BOOT>>12); + LAPIC_WRITE(ICRD, lapic << LAPIC_ICRD_DEST_SHIFT); + LAPIC_WRITE(ICR, LAPIC_ICR_DM_STARTUP|(MP_BOOT>>12)); delay(200); #ifdef POSTCODE_DELAY /* Wait much longer if postcodes are displayed for a delay period. */ i *= 10000; #endif - while(i-- > 0) { - if (cpu_datap(slot_num)->cpu_running) - break; - delay(10000); - } + mp_wait_for_cpu_up(slot_num, i, 10000); mp_enable_preemption(); mutex_unlock(&mp_cpu_boot_lock); @@ -803,6 +275,47 @@ intel_startCPU( } } +/* + * Quickly bring a CPU back online which has been halted. + */ +kern_return_t +intel_startCPU_fast(int slot_num) +{ + kern_return_t rc; + + /* + * Try to perform a fast restart + */ + rc = pmCPUExitHalt(slot_num); + if (rc != KERN_SUCCESS) + /* + * The CPU was not eligible for a fast restart. + */ + return(rc); + + /* + * Wait until the CPU is back online. 
+ */ + mp_disable_preemption(); + + /* + * We use short pauses (1us) for low latency. 30,000 iterations is + * longer than a full restart would require so it should be more + * than long enough. + */ + mp_wait_for_cpu_up(slot_num, 30000, 1); + mp_enable_preemption(); + + /* + * Check to make sure that the CPU is really running. If not, + * go through the slow path. + */ + if (cpu_datap(slot_num)->cpu_running) + return(KERN_SUCCESS); + else + return(KERN_FAILURE); +} + extern char slave_boot_base[]; extern char slave_boot_end[]; extern void slave_pstart(void); @@ -854,7 +367,7 @@ MP_EVENT_NAME_DECL(); #endif /* MP_DEBUG */ -void +int cpu_signal_handler(x86_saved_state_t *regs) { int my_cpu; @@ -924,11 +437,10 @@ cpu_signal_handler(x86_saved_state_t *regs) mp_enable_preemption(); + return 0; } -/* We want this to show up in backtraces, hence marked noinline. - */ -static int __attribute__((noinline)) +static int NMIInterruptHandler(x86_saved_state_t *regs) { void *stackptr; @@ -936,8 +448,13 @@ NMIInterruptHandler(x86_saved_state_t *regs) sync_iss_to_iks_unconditionally(regs); __asm__ volatile("movl %%ebp, %0" : "=m" (stackptr)); + if (cpu_number() == debugger_cpu) + goto NMExit; + if (pmap_tlb_flush_timeout == TRUE && current_cpu_datap()->cpu_tlb_invalid) { - panic_i386_backtrace(stackptr, 10, "Panic: Unresponsive processor\n", TRUE, regs); + char pstr[128]; + snprintf(&pstr[0], sizeof(pstr), "Panic(CPU %d): Unresponsive processor\n", cpu_number()); + panic_i386_backtrace(stackptr, 10, &pstr[0], TRUE, regs); panic_io_port_read(); mca_check_save(); if (pmsafe_debug) @@ -947,11 +464,12 @@ NMIInterruptHandler(x86_saved_state_t *regs) } } mp_kdp_wait(FALSE); +NMExit: return 1; } #ifdef MP_DEBUG -extern int max_lock_loops; +int max_lock_loops = 1000000; int trappedalready = 0; /* (BRINGUP */ #endif /* MP_DEBUG */ @@ -981,20 +499,18 @@ i386_cpu_IPI(int cpu) /* Wait for previous interrupt to be delivered... 
*/ #ifdef MP_DEBUG int pending_busy_count = 0; - while (LAPIC_REG(ICR) & LAPIC_ICR_DS_PENDING) { + while (LAPIC_READ(ICR) & LAPIC_ICR_DS_PENDING) { if (++pending_busy_count > max_lock_loops) panic("i386_cpu_IPI() deadlock\n"); #else - while (LAPIC_REG(ICR) & LAPIC_ICR_DS_PENDING) { + while (LAPIC_READ(ICR) & LAPIC_ICR_DS_PENDING) { #endif /* MP_DEBUG */ cpu_pause(); } state = ml_set_interrupts_enabled(FALSE); - LAPIC_REG(ICRD) = - cpu_to_lapic[cpu] << LAPIC_ICRD_DEST_SHIFT; - LAPIC_REG(ICR) = - LAPIC_VECTOR(INTERPROCESSOR) | LAPIC_ICR_DM_FIXED; + LAPIC_WRITE(ICRD, cpu_to_lapic[cpu] << LAPIC_ICRD_DEST_SHIFT); + LAPIC_WRITE(ICR, LAPIC_VECTOR(INTERPROCESSOR) | LAPIC_ICR_DM_FIXED); (void) ml_set_interrupts_enabled(state); } @@ -1023,13 +539,11 @@ cpu_NMI_interrupt(int cpu) if (smp_initialized) { state = ml_set_interrupts_enabled(FALSE); /* Program the interrupt command register */ - LAPIC_REG(ICRD) = - cpu_to_lapic[cpu] << LAPIC_ICRD_DEST_SHIFT; + LAPIC_WRITE(ICRD, cpu_to_lapic[cpu] << LAPIC_ICRD_DEST_SHIFT); /* The vector is ignored in this case--the target CPU will enter on the * NMI vector. 
*/ - LAPIC_REG(ICR) = - LAPIC_VECTOR(INTERPROCESSOR) | LAPIC_ICR_DM_NMI; + LAPIC_WRITE(ICR, LAPIC_VECTOR(INTERPROCESSOR)|LAPIC_ICR_DM_NMI); (void) ml_set_interrupts_enabled(state); } } @@ -1491,7 +1005,7 @@ int pmsafe_debug = 1; #if MACH_KDP volatile boolean_t mp_kdp_trap = FALSE; -volatile unsigned long mp_kdp_ncpus; +volatile unsigned long mp_kdp_ncpus; boolean_t mp_kdp_state; @@ -1500,7 +1014,7 @@ mp_kdp_enter(void) { unsigned int cpu; unsigned int ncpus; - unsigned int my_cpu = cpu_number(); + unsigned int my_cpu; uint64_t tsc_timeout; DBG("mp_kdp_enter()\n"); @@ -1522,6 +1036,8 @@ mp_kdp_enter(void) mp_kdp_wait(TRUE); simple_lock(&mp_kdp_lock); } + my_cpu = cpu_number(); + debugger_cpu = my_cpu; mp_kdp_ncpus = 1; /* self */ mp_kdp_trap = TRUE; simple_unlock(&mp_kdp_lock); @@ -1634,6 +1150,7 @@ void mp_kdp_exit(void) { DBG("mp_kdp_exit()\n"); + debugger_cpu = -1; atomic_decl((volatile long *)&mp_kdp_ncpus, 1); mp_kdp_trap = FALSE; __asm__ volatile("mfence"); @@ -1776,62 +1293,91 @@ mp_kdb_exit(void) #endif /* MACH_KDB */ -/* - * i386_init_slave() is called from pstart. - * We're in the cpu's interrupt stack with interrupts disabled. - * At this point we are in legacy mode. We need to switch on IA32e - * if the mode is set to 64-bits. 
- */ -void -i386_init_slave(void) +static void +do_init_slave(boolean_t fast_restart) { + void *init_param = FULL_SLAVE_INIT; + postcode(I386_INIT_SLAVE); - /* Ensure that caching and write-through are enabled */ - set_cr0(get_cr0() & ~(CR0_NW|CR0_CD)); + if (!fast_restart) { + /* Ensure that caching and write-through are enabled */ + set_cr0(get_cr0() & ~(CR0_NW|CR0_CD)); - DBG("i386_init_slave() CPU%d: phys (%d) active.\n", - get_cpu_number(), get_cpu_phys_number()); + DBG("i386_init_slave() CPU%d: phys (%d) active.\n", + get_cpu_number(), get_cpu_phys_number()); - assert(!ml_get_interrupts_enabled()); + assert(!ml_get_interrupts_enabled()); - cpu_mode_init(current_cpu_datap()); + cpu_mode_init(current_cpu_datap()); - mca_cpu_init(); + mca_cpu_init(); - lapic_init(); - LAPIC_DUMP(); - LAPIC_CPU_MAP_DUMP(); + lapic_configure(); + LAPIC_DUMP(); + LAPIC_CPU_MAP_DUMP(); - init_fpu(); + init_fpu(); - mtrr_update_cpu(); + mtrr_update_cpu(); + } else + init_param = FAST_SLAVE_INIT; /* resume VT operation */ vmx_resume(); - pat_init(); + if (!fast_restart) + pat_init(); cpu_thread_init(); /* not strictly necessary */ cpu_init(); /* Sets cpu_running which starter cpu waits for */ - slave_main(); + slave_main(init_param); - panic("i386_init_slave() returned from slave_main()"); + panic("do_init_slave() returned from slave_main()"); } +/* + * i386_init_slave() is called from pstart. + * We're in the cpu's interrupt stack with interrupts disabled. + * At this point we are in legacy mode. We need to switch on IA32e + * if the mode is set to 64-bits. + */ void -slave_machine_init(void) +i386_init_slave(void) +{ + do_init_slave(FALSE); +} + +/* + * i386_init_slave_fast() is called from pmCPUHalt. + * We're running on the idle thread and need to fix up + * some accounting and get it so that the scheduler sees this + * CPU again. 
+ */ +void +i386_init_slave_fast(void) +{ + do_init_slave(TRUE); +} + +void +slave_machine_init(void *param) { /* * Here in process context, but with interrupts disabled. */ DBG("slave_machine_init() CPU%d\n", get_cpu_number()); - clock_init(); + if (param == FULL_SLAVE_INIT) { + /* + * Cold start + */ + clock_init(); - cpu_machine_init(); /* Interrupts enabled hereafter */ + cpu_machine_init(); /* Interrupts enabled hereafter */ + } } #undef cpu_number() diff --git a/osfmk/i386/mp.h b/osfmk/i386/mp.h index 99ba34fe2..0fac0fbd5 100644 --- a/osfmk/i386/mp.h +++ b/osfmk/i386/mp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -58,8 +58,8 @@ */ #ifdef KERNEL_PRIVATE -#ifndef _I386AT_MP_H_ -#define _I386AT_MP_H_ +#ifndef _I386_MP_H_ +#define _I386_MP_H_ #ifndef DEBUG #include @@ -69,114 +69,27 @@ #include #include -#define LAPIC_ID_MAX (LAPIC_ID_MASK) - -#define MAX_CPUS (LAPIC_ID_MAX + 1) +#define MAX_CPUS 32 /* (8*sizeof(long)) */ #ifndef ASSEMBLER +#include #include #include #include #include +#include __BEGIN_DECLS extern kern_return_t intel_startCPU(int slot_num); +extern kern_return_t intel_startCPU_fast(int slot_num); extern void i386_init_slave(void); +extern void i386_init_slave_fast(void); extern void smp_init(void); extern void cpu_interrupt(int cpu); - -extern void lapic_init(void); -extern void lapic_shutdown(void); -extern void lapic_smm_restore(void); -extern boolean_t lapic_probe(void); -extern void lapic_dump(void); -extern int lapic_interrupt(int interrupt, x86_saved_state_t *state); -extern void lapic_end_of_interrupt(void); -extern int lapic_to_cpu[]; -extern int cpu_to_lapic[]; -extern int lapic_interrupt_base; -extern void lapic_cpu_map(int lapic, int cpu_num); -extern uint32_t ml_get_apicid(uint32_t cpu); - -extern void lapic_set_timer( - boolean_t interrupt, - lapic_timer_mode_t mode, - 
lapic_timer_divide_t divisor, - lapic_timer_count_t initial_count); - -extern void lapic_get_timer( - lapic_timer_mode_t *mode, - lapic_timer_divide_t *divisor, - lapic_timer_count_t *initial_count, - lapic_timer_count_t *current_count); - -typedef void (*i386_intr_func_t)(void *); -extern void lapic_set_timer_func(i386_intr_func_t func); -extern void lapic_set_pmi_func(i386_intr_func_t func); -extern void lapic_set_thermal_func(i386_intr_func_t func); - __END_DECLS -/* - * By default, use high vectors to leave vector space for systems - * with multiple I/O APIC's. However some systems that boot with - * local APIC disabled will hang in SMM when vectors greater than - * 0x5F are used. Those systems are not expected to have I/O APIC - * so 16 (0x50 - 0x40) vectors for legacy PIC support is perfect. - */ -#define LAPIC_DEFAULT_INTERRUPT_BASE 0xD0 -#define LAPIC_REDUCED_INTERRUPT_BASE 0x50 -/* - * Specific lapic interrupts are relative to this base - * in priority order from high to low: - */ - -#define LAPIC_PERFCNT_INTERRUPT 0xF -#define LAPIC_TIMER_INTERRUPT 0xE -#define LAPIC_INTERPROCESSOR_INTERRUPT 0xD -#define LAPIC_THERMAL_INTERRUPT 0xC -#define LAPIC_ERROR_INTERRUPT 0xB -#define LAPIC_SPURIOUS_INTERRUPT 0xA -/* The vector field is ignored for NMI interrupts via the LAPIC - * or otherwise, so this is not an offset from the interrupt - * base. 
- */ -#define LAPIC_NMI_INTERRUPT 0x2 - -#define LAPIC_REG(reg) \ - (*((volatile uint32_t *)(lapic_start + LAPIC_##reg))) -#define LAPIC_REG_OFFSET(reg,off) \ - (*((volatile uint32_t *)(lapic_start + LAPIC_##reg + (off)))) - -#define LAPIC_VECTOR(src) \ - (lapic_interrupt_base + LAPIC_##src##_INTERRUPT) - -#define LAPIC_ISR_IS_SET(base,src) \ - (LAPIC_REG_OFFSET(ISR_BASE,((base+LAPIC_##src##_INTERRUPT)/32)*0x10) & \ - (1 <<((base + LAPIC_##src##_INTERRUPT)%32))) - -extern vm_offset_t lapic_start; - -#endif /* ASSEMBLER */ - -#define CPU_NUMBER(r) \ - movl %gs:CPU_NUMBER_GS,r - -#define CPU_NUMBER_FROM_LAPIC(r) \ - movl EXT(lapic_id),r; \ - movl 0(r),r; \ - shrl $(LAPIC_ID_SHIFT),r; \ - andl $(LAPIC_ID_MASK),r; \ - movl EXT(lapic_to_cpu)(,r,4),r - - -/* word describing the reason for the interrupt, one per cpu */ - -#ifndef ASSEMBLER -#include - extern unsigned int real_ncpus; /* real number of cpus */ extern unsigned int max_ncpus; /* max number of cpus */ decl_simple_lock_data(extern,kdb_lock) /* kdb lock */ @@ -425,6 +338,6 @@ extern cpu_signal_event_log_t *cpu_handle[]; #define MP_ENABLE_PREEMPTION_NO_CHECK #endif /* MACH_RT */ -#endif /* _I386AT_MP_H_ */ +#endif /* _I386_MP_H_ */ #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/i386/mp_desc.c b/osfmk/i386/mp_desc.c index 75bbe25cf..9e2df152e 100644 --- a/osfmk/i386/mp_desc.c +++ b/osfmk/i386/mp_desc.c @@ -94,7 +94,12 @@ extern uint32_t low_eintstack[]; /* top */ * The master cpu (cpu 0) has its data area statically allocated; * others are allocated dynamically and this array is updated at runtime. 
*/ -cpu_data_t cpu_data_master; +cpu_data_t cpu_data_master = { + .cpu_this = &cpu_data_master, + .cpu_nanotime = &rtc_nanotime_info, + .cpu_is64bit = FALSE, + .cpu_int_stack_top = (vm_offset_t) low_eintstack, + }; cpu_data_t *cpu_data_ptr[MAX_CPUS] = { [0] &cpu_data_master }; decl_simple_lock_data(,cpu_lock); /* protects real_ncpus */ @@ -488,9 +493,6 @@ cpu_data_alloc(boolean_t is_boot_cpu) if (cdp->cpu_processor == NULL) { cdp->cpu_processor = cpu_processor_alloc(TRUE); cdp->cpu_pmap = pmap_cpu_alloc(TRUE); - cdp->cpu_this = cdp; - cdp->cpu_is64bit = FALSE; - cdp->cpu_int_stack_top = (vm_offset_t) low_eintstack; cpu_desc_init(cdp, TRUE); fast_syscall_init(); } @@ -566,6 +568,8 @@ cpu_data_alloc(boolean_t is_boot_cpu) real_ncpus++; simple_unlock(&cpu_lock); + cdp->cpu_nanotime = &rtc_nanotime_info; + kprintf("cpu_data_alloc(%d) %p desc_table: %p " "ldt: %p " "int_stack: 0x%x-0x%x\n", diff --git a/osfmk/i386/perfmon.c b/osfmk/i386/perfmon.c index 61e107963..1dffe6d59 100644 --- a/osfmk/i386/perfmon.c +++ b/osfmk/i386/perfmon.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include @@ -260,7 +260,7 @@ _pmc_machine_type(void) static void pmc_p4_intr(void *state) { - pmc_table_t *pmc_table = (pmc_table_t *) x86_core()->pmc; + pmc_table_t *pmc_table = (pmc_table_t *) x86_lcpu()->pmc; uint32_t cccr_addr; pmc_cccr_t cccr; pmc_id_t id; @@ -300,7 +300,7 @@ pmc_p4_intr(void *state) static void pmc_p6_intr(void *state) { - pmc_table_t *pmc_table = (pmc_table_t *) x86_core()->pmc; + pmc_table_t *pmc_table = (pmc_table_t *) x86_lcpu()->pmc; pmc_id_t id; /* @@ -315,7 +315,7 @@ pmc_p6_intr(void *state) static void pmc_core_intr(void *state) { - pmc_table_t *pmc_table = (pmc_table_t *) x86_core()->pmc; + pmc_table_t *pmc_table = (pmc_table_t *) x86_lcpu()->pmc; pmc_id_t id; pmc_global_status_t ovf_status; @@ -367,7 +367,7 @@ pmc_alloc(void) pmc_table->id_max = 17; pmc_table->msr_counter_base = MSR_COUNTER_ADDR(0); pmc_table->msr_control_base 
= MSR_CCCR_ADDR(0); - lapic_set_pmi_func(&pmc_p4_intr); + lapic_set_pmi_func((i386_intr_func_t) &pmc_p4_intr); break; case pmc_Core: pmc_table->id_max = 1; @@ -376,13 +376,13 @@ pmc_alloc(void) pmc_table->Core.msr_global_ctrl = MSR_PERF_GLOBAL_CTRL; pmc_table->Core.msr_global_ovf_ctrl = MSR_PERF_GLOBAL_OVF_CTRL; pmc_table->Core.msr_global_status = MSR_PERF_GLOBAL_STATUS; - lapic_set_pmi_func(&pmc_core_intr); + lapic_set_pmi_func((i386_intr_func_t) &pmc_core_intr); break; case pmc_P6: pmc_table->id_max = 1; pmc_table->msr_counter_base = MSR_P6_COUNTER_ADDR(0); pmc_table->msr_control_base = MSR_P6_PES_ADDR(0); - lapic_set_pmi_func(&pmc_p6_intr); + lapic_set_pmi_func((i386_intr_func_t) &pmc_p6_intr); break; default: break; @@ -398,12 +398,12 @@ pmc_alloc(void) static inline pmc_table_t * pmc_table_valid(pmc_id_t id) { - x86_core_t *my_core = x86_core(); + x86_lcpu_t *my_lcpu = x86_lcpu(); pmc_table_t *pmc; - assert(my_core != NULL); + assert(my_lcpu != NULL); - pmc = (pmc_table_t *) my_core->pmc; + pmc = (pmc_table_t *) my_lcpu->pmc; if ((pmc == NULL) || (id > pmc->id_max) || (pmc->machine_type == pmc_P4_Xeon && !pmc->P4.reserved[id]) || @@ -416,12 +416,12 @@ pmc_table_valid(pmc_id_t id) int pmc_machine_type(pmc_machine_t *type) { - x86_core_t *my_core = x86_core(); + x86_lcpu_t *my_lcpu = x86_lcpu(); pmc_table_t *pmc_table; - assert(my_core != NULL); + assert(my_lcpu != NULL); - pmc_table = (pmc_table_t *) my_core->pmc; + pmc_table = (pmc_table_t *) my_lcpu->pmc; if (pmc_table == NULL) return KERN_FAILURE; @@ -433,12 +433,12 @@ pmc_machine_type(pmc_machine_t *type) int pmc_reserve(pmc_id_t id) { - x86_core_t *my_core = x86_core(); + x86_lcpu_t *my_lcpu = x86_lcpu(); pmc_table_t *pmc_table; - assert(my_core != NULL); + assert(my_lcpu != NULL); - pmc_table = (pmc_table_t *) my_core->pmc; + pmc_table = (pmc_table_t *) my_lcpu->pmc; if (pmc_table == NULL) return KERN_FAILURE; if (id > pmc_table->id_max) diff --git a/osfmk/i386/pmCPU.c b/osfmk/i386/pmCPU.c index 
1f12073fb..d2efc8bc9 100644 --- a/osfmk/i386/pmCPU.c +++ b/osfmk/i386/pmCPU.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2007 Apple Inc. All rights reserved. + * Copyright (c) 2004-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -31,6 +31,7 @@ * * Implements the "wrappers" to the KEXT. */ +#include #include #include #include @@ -44,6 +45,14 @@ #include #include #include +#include + +/* + * Kernel parameter determining whether threads are halted unconditionally + * in the idle state. This is the default behavior. + * See machine_idle() for use. + */ +int idlehalt = 1; extern int disableConsoleOutput; @@ -54,185 +63,8 @@ decl_simple_lock_data(,pm_init_lock); */ pmDispatch_t *pmDispatch = NULL; -/* - * Current power management states (for use until KEXT is loaded). - */ -static pmInitState_t pmInitState; - static uint32_t pmInitDone = 0; -/* - * Nap control variables: - */ -uint32_t forcenap = 0; /* Force nap (fn) boot-arg controls */ - -/* - * Do any initialization needed - */ -void -pmsInit(void) -{ - static int initialized = 0; - - /* - * Initialize some of the initial state to "uninitialized" until - * it gets set with something more useful. This allows the KEXT - * to determine if the initial value was actually set to something. - */ - if (!initialized) { - pmInitState.PState = -1; - pmInitState.PLimit = -1; - pmInitState.maxBusDelay = -1; - initialized = 1; - } - - if (pmDispatch != NULL && pmDispatch->pmsInit != NULL) - (*pmDispatch->pmsInit)(); -} - -/* - * Start the power management stepper on all processors - * - * All processors must be parked. This should be called when the hardware - * is ready to step. Probably only at boot and after wake from sleep. - * - */ -void -pmsStart(void) -{ - if (pmDispatch != NULL && pmDispatch->pmsStart != NULL) - (*pmDispatch->pmsStart)(); -} - -/* - * Park the stepper execution. This will force the stepper on this - * processor to abandon its current step and stop. 
No changes to the - * hardware state is made and any previous step is lost. - * - * This is used as the initial state at startup and when the step table - * is being changed. - * - */ -void -pmsPark(void) -{ - if (pmDispatch != NULL && pmDispatch->pmsPark != NULL) - (*pmDispatch->pmsPark)(); -} - -/* - * Control the Power Management Stepper. - * Called from user state by the superuser. - * Interrupts disabled. - * - * This interface is deprecated and is now a no-op. - */ -kern_return_t -pmsControl(__unused uint32_t request, __unused user_addr_t reqaddr, - __unused uint32_t reqsize) -{ - return(KERN_SUCCESS); -} - -/* - * Broadcast a change to all processors including ourselves. - * - * Interrupts disabled. - */ -void -pmsRun(uint32_t nstep) -{ - if (pmDispatch != NULL && pmDispatch->pmsRun != NULL) - (*pmDispatch->pmsRun)(nstep); -} - -/* - * Build the tables needed for the stepper. This includes both the step - * definitions and the step control table. - * - * We most absolutely need to be parked before this happens because we're - * going to change the table. We also have to be complte about checking - * for errors. A copy is always made because we don't want to be crippled - * by not being able to change the table or description formats. - * - * We pass in a table of external functions and the new stepper def uses - * the corresponding indexes rather than actual function addresses. This - * is done so that a proper table can be built with the control syscall. - * It can't supply addresses, so the index has to do. We internalize the - * table so our caller does not need to keep it. Note that passing in a 0 - * will use the current function table. Also note that entry 0 is reserved - * and must be 0, we will check and fail the build. - * - * The platformData parameter is a 32-bit word of data that is passed unaltered - * to the set function. - * - * The queryFunc parameter is the address of a function that will return the - * current state of the platform. 
The format of the data returned is the same - * as the platform specific portions of pmsSetCmd, i.e., pmsXClk, pmsVoltage, - * and any part of pmsPowerID that is maintained by the platform hardware - * (an example would be the values of the gpios that correspond to pmsPowerID). - * The value should be constructed by querying hardware rather than returning - * a value cached by software. One of the intents of this function is to help - * recover lost or determine initial power states. - * - */ -kern_return_t -pmsBuild(pmsDef *pd, uint32_t pdsize, pmsSetFunc_t *functab, - uint32_t platformData, pmsQueryFunc_t queryFunc) -{ - kern_return_t rc = 0; - - if (pmDispatch != NULL && pmDispatch->pmsBuild != NULL) - rc = (*pmDispatch->pmsBuild)(pd, pdsize, functab, - platformData, queryFunc); - - return(rc); -} - - -/* - * Load a new ratio/VID table. - * - * Note that this interface is specific to the Intel SpeedStep implementation. - * It is expected that this will only be called once to override the default - * ratio/VID table when the platform starts. - * - * Normally, the table will need to be replaced at the same time that the - * stepper program proper is replaced, as the PState indices from an old - * program may no longer be valid. When replacing the default program this - * should not be a problem as any new table will have at least two PState - * entries and the default program only references P0 and P1. - */ -kern_return_t -pmsCPULoadVIDTable(uint16_t *tablep, int nstates) -{ - if (pmDispatch != NULL && pmDispatch->pmsCPULoadVIDTable != NULL) - return((*pmDispatch->pmsCPULoadVIDTable)(tablep, nstates)); - else { - int i; - - if (nstates > MAX_PSTATES) - return(KERN_FAILURE); - - for (i = 0; i < nstates; i += 1) - pmInitState.VIDTable[i] = tablep[i]; - } - return(KERN_SUCCESS); -} - -/* - * Set the (global) PState limit. CPUs will not be permitted to run at - * a lower (more performant) PState than this. 
- */ -kern_return_t -pmsCPUSetPStateLimit(uint32_t limit) -{ - if (pmDispatch != NULL && pmDispatch->pmsCPUSetPStateLimit != NULL) - return((*pmDispatch->pmsCPUSetPStateLimit)(limit)); - - pmInitState.PLimit = limit; - return(KERN_SUCCESS); -} /* * Initialize the Cstate change code. @@ -255,62 +87,55 @@ power_management_init(void) } /* - * ACPI calls the following routine to set/update mwait hints. A table - * (possibly null) specifies the available Cstates and their hints, all - * other states are assumed to be invalid. ACPI may update available - * states to change the nap policy (for example, while AC power is - * available). + * Called when the CPU is idle. It calls into the power management kext + * to determine the best way to idle the CPU. */ -kern_return_t -Cstate_table_set(Cstate_hint_t *tablep, unsigned int nstates) +void +machine_idle(void) { - if (forcenap) - return(KERN_SUCCESS); + cpu_data_t *my_cpu = current_cpu_datap(); - if (pmDispatch != NULL && pmDispatch->cstateTableSet != NULL) - return((*pmDispatch->cstateTableSet)(tablep, nstates)); - else { - unsigned int i; + if (my_cpu == NULL) + goto out; - for (i = 0; i < nstates; i += 1) { - pmInitState.CStates[i].number = tablep[i].number; - pmInitState.CStates[i].hint = tablep[i].hint; - } + /* + * If idlehalt isn't set, then don't do any power management related + * idle handling. + */ + if (!idlehalt) + goto out; + + my_cpu->lcpu.state = LCPU_IDLE; + DBGLOG(cpu_handle, cpu_number(), MP_IDLE); + MARK_CPU_IDLE(cpu_number()); - pmInitState.CStatesCount = nstates; + if (pmInitDone + && pmDispatch != NULL + && pmDispatch->cstateMachineIdle != NULL) + (*pmDispatch->cstateMachineIdle)(0x7FFFFFFFFFFFFFFFULL); + else { + /* + * If no power management, re-enable interrupts and halt. + * This will keep the CPU from spinning through the scheduler + * and will allow at least some minimal power savings (but it + * may cause problems in some MP configurations w.r.t. the APIC + * stopping during a GV3 transition). 
+ */ + __asm__ volatile ("sti; hlt"); } - return(KERN_SUCCESS); -} -/* - * Called when the CPU is idle. It will choose the best C state to - * be in. - */ -void -machine_idle_cstate(boolean_t halted) -{ - if (pmInitDone - && pmDispatch != NULL - && pmDispatch->cstateMachineIdle != NULL) - (*pmDispatch->cstateMachineIdle)(!halted ? - 0x7FFFFFFFFFFFFFFFULL : 0ULL); - else if (halted) { - /* - * If no power managment and a processor is taken off-line, - * then invalidate the cache and halt it (it will not be able - * to be brought back on-line without resetting the CPU). - */ - __asm__ volatile ( "wbinvd; hlt" ); - } else { - /* - * If no power management, re-enable interrupts and halt. - * This will keep the CPU from spinning through the scheduler - * and will allow at least some minimal power savings (but it - * may cause problems in some MP configurations w.r.t to the - * APIC stopping during a P-State transition). - */ - __asm__ volatile ( "sti; hlt" ); - } + /* + * Mark the CPU as running again. + */ + MARK_CPU_ACTIVE(cpu_number()); + DBGLOG(cpu_handle, cpu_number(), MP_UNIDLE); + my_cpu->lcpu.state = LCPU_RUN; + + /* + * Re-enable interrupts. + */ + out: + __asm__ volatile("sti"); } /* @@ -320,13 +145,16 @@ machine_idle_cstate(boolean_t halted) void pmCPUHalt(uint32_t reason) { + cpu_data_t *cpup = current_cpu_datap(); switch (reason) { case PM_HALT_DEBUG: + cpup->lcpu.state = LCPU_PAUSE; __asm__ volatile ("wbinvd; hlt"); break; case PM_HALT_PANIC: + cpup->lcpu.state = LCPU_PAUSE; __asm__ volatile ("cli; wbinvd; hlt"); break; @@ -337,31 +165,40 @@ pmCPUHalt(uint32_t reason) if (pmInitDone && pmDispatch != NULL && pmDispatch->pmCPUHalt != NULL) { + /* + * Halt the CPU (and put it in a low power state). + */ (*pmDispatch->pmCPUHalt)(); - } else { - cpu_data_t *cpup = current_cpu_datap(); + /* + * We've exited halt, so get the CPU schedulable again. 
+ */ + i386_init_slave_fast(); + + panic("init_slave_fast returned"); + } else { /* * If no power managment and a processor is taken off-line, * then invalidate the cache and halt it (it will not be able * to be brought back on-line without resetting the CPU). */ __asm__ volatile ("wbinvd"); - cpup->lcpu.halted = TRUE; + cpup->lcpu.state = LCPU_HALT; __asm__ volatile ( "wbinvd; hlt" ); + + panic("back from Halt"); } break; } } -/* - * Called to initialize the power management structures for the CPUs. - */ void -pmCPUStateInit(void) +pmMarkAllCPUsOff(void) { - if (pmDispatch != NULL && pmDispatch->pmCPUStateInit != NULL) - (*pmDispatch->pmCPUStateInit)(); + if (pmInitDone + && pmDispatch != NULL + && pmDispatch->markAllCPUsOff != NULL) + (*pmDispatch->markAllCPUsOff)(); } static void @@ -398,6 +235,20 @@ pmGetMyCore(void) return(cpup->lcpu.core); } +static x86_die_t * +pmGetDie(int cpu) +{ + return(cpu_to_die(cpu)); +} + +static x86_die_t * +pmGetMyDie(void) +{ + cpu_data_t *cpup = current_cpu_datap(); + + return(cpup->lcpu.die); +} + static x86_pkg_t * pmGetPackage(int cpu) { @@ -409,7 +260,7 @@ pmGetMyPackage(void) { cpu_data_t *cpup = current_cpu_datap(); - return(cpup->lcpu.core->package); + return(cpup->lcpu.package); } static void @@ -484,29 +335,43 @@ pmCPUExitIdle(cpu_data_t *cpu) return(do_ipi); } +kern_return_t +pmCPUExitHalt(int cpu) +{ + kern_return_t rc = KERN_INVALID_ARGUMENT; + + if (pmInitDone + && pmDispatch != NULL + && pmDispatch->exitHalt != NULL) + rc = pmDispatch->exitHalt(cpu_to_lcpu(cpu)); + + return(rc); +} + /* - * Called when a CPU is being restarted after being powered off (as in S3). + * Called to initialize the power management structures for the CPUs. 
*/ void -pmCPUMarkRunning(cpu_data_t *cpu) +pmCPUStateInit(void) { - if (pmInitDone - && pmDispatch != NULL - && pmDispatch->markCPURunning != NULL) - (*pmDispatch->markCPURunning)(&cpu->lcpu); + if (pmDispatch != NULL && pmDispatch->pmCPUStateInit != NULL) + (*pmDispatch->pmCPUStateInit)(); } /* - * Called from the HPET interrupt handler to perform the - * necessary power management work. + * Called when a CPU is being restarted after being powered off (as in S3). */ void -pmHPETInterrupt(void) +pmCPUMarkRunning(cpu_data_t *cpu) { + cpu_data_t *cpup = current_cpu_datap(); + if (pmInitDone && pmDispatch != NULL - && pmDispatch->HPETInterrupt != NULL) - (*pmDispatch->HPETInterrupt)(); + && pmDispatch->markCPURunning != NULL) + (*pmDispatch->markCPURunning)(&cpu->lcpu); + else + cpup->lcpu.state = LCPU_RUN; } /* @@ -524,6 +389,30 @@ pmCPUControl(uint32_t cmd, void *datap) return(rc); } +/* + * Called to save the timer state used by power management prior + * to "sleeping". + */ +void +pmTimerSave(void) +{ + if (pmDispatch != NULL + && pmDispatch->pmTimerStateSave != NULL) + (*pmDispatch->pmTimerStateSave)(); +} + +/* + * Called to restore the timer state used by power management after + * waking from "sleep". + */ +void +pmTimerRestore(void) +{ + if (pmDispatch != NULL + && pmDispatch->pmTimerStateRestore != NULL) + (*pmDispatch->pmTimerStateRestore)(); +} + /* * Set the worst-case time for the C4 to C2 transition. * No longer does anything. @@ -578,8 +467,29 @@ ml_set_maxbusdelay(uint32_t mdelay) if (pmDispatch != NULL && pmDispatch->setMaxBusDelay != NULL) pmDispatch->setMaxBusDelay(maxdelay); - else - pmInitState.maxBusDelay = maxdelay; +} + +uint64_t +ml_get_maxintdelay(void) +{ + uint64_t max_delay = 0; + + if (pmDispatch != NULL + && pmDispatch->getMaxIntDelay != NULL) + max_delay = pmDispatch->getMaxIntDelay(); + + return(max_delay); +} + +/* + * Set the maximum delay allowed for an interrupt. 
+ */ +void +ml_set_maxintdelay(uint64_t mdelay) +{ + if (pmDispatch != NULL + && pmDispatch->setMaxIntDelay != NULL) + pmDispatch->setMaxIntDelay(mdelay); } /* @@ -602,15 +512,14 @@ pmSafeMode(x86_lcpu_t *lcpu, uint32_t flags) * We only look at the PAUSE and RESUME flags. The other flag(s) * will not make any sense without the KEXT, so just ignore them. * - * We set the halted flag in the LCPU structure to indicate - * that this CPU isn't to do anything. If it's the CPU we're - * currently running on, then spin until the halted flag is - * reset. + * We set the CPU's state to indicate that it's halted. If this + * is the CPU we're currently running on, then spin until the + * state becomes non-halted. */ if (flags & PM_SAFE_FL_PAUSE) { - lcpu->halted = TRUE; + lcpu->state = LCPU_PAUSE; if (lcpu == x86_lcpu()) { - while (lcpu->halted) + while (lcpu->state == LCPU_PAUSE) cpu_pause(); } } @@ -620,7 +529,7 @@ pmSafeMode(x86_lcpu_t *lcpu, uint32_t flags) * get it out of it's spin loop. */ if (flags & PM_SAFE_FL_RESUME) { - lcpu->halted = FALSE; + lcpu->state = LCPU_RUN; } } } @@ -657,21 +566,23 @@ pmKextRegister(uint32_t version, pmDispatch_t *cpuFuncs, pmCallBacks_t *callbacks) { if (callbacks != NULL && version == PM_DISPATCH_VERSION) { - callbacks->InitState = &pmInitState; callbacks->setRTCPop = setPop; callbacks->resyncDeadlines = etimer_resync_deadlines; callbacks->initComplete= pmInitComplete; callbacks->GetLCPU = pmGetLogicalCPU; callbacks->GetCore = pmGetCore; + callbacks->GetDie = pmGetDie; callbacks->GetPackage = pmGetPackage; callbacks->GetMyLCPU = pmGetMyLogicalCPU; callbacks->GetMyCore = pmGetMyCore; + callbacks->GetMyDie = pmGetMyDie; callbacks->GetMyPackage= pmGetMyPackage; - callbacks->CoresPerPkg = cpuid_info()->cpuid_cores_per_package; callbacks->GetPkgRoot = pmGetPkgRoot; callbacks->LockCPUTopology = pmLockCPUTopology; callbacks->GetHibernate = pmCPUGetHibernate; callbacks->LCPUtoProcessor = pmLCPUtoProcessor; + callbacks->ThreadBind = thread_bind; + 
callbacks->topoParms = &topoParms; } if (cpuFuncs != NULL) { @@ -690,3 +601,42 @@ pmUnRegister(pmDispatch_t *cpuFuncs) } } +/****************************************************************************** + * + * All of the following are deprecated interfaces and no longer used. + * + ******************************************************************************/ +kern_return_t +pmsControl(__unused uint32_t request, __unused user_addr_t reqaddr, + __unused uint32_t reqsize) +{ + return(KERN_SUCCESS); +} + +void +pmsInit(void) +{ +} + +void +pmsStart(void) +{ +} + +void +pmsPark(void) +{ +} + +void +pmsRun(__unused uint32_t nstep) +{ +} + +kern_return_t +pmsBuild(__unused pmsDef *pd, __unused uint32_t pdsize, + __unused pmsSetFunc_t *functab, + __unused uint32_t platformData, __unused pmsQueryFunc_t queryFunc) +{ + return(KERN_SUCCESS); +} diff --git a/osfmk/i386/pmCPU.h b/osfmk/i386/pmCPU.h index 1ef88c3fe..ca3072b2a 100644 --- a/osfmk/i386/pmCPU.h +++ b/osfmk/i386/pmCPU.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2007 Apple Inc. All rights reserved. + * Copyright (c) 2006-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -29,46 +29,15 @@ #ifndef _I386_PMCPU_H_ #define _I386_PMCPU_H_ -#include #include #ifndef ASSEMBLER -#define MAX_PSTATES 32 /* architectural limit */ - -typedef enum -{ - Cn1, Cn2, Cn3, Cn4, Cn5, Cn6, CnHlt, Cn0, CnRun, Cnmax -} Cstate_number_t; - -typedef struct -{ - Cstate_number_t number; - uint32_t hint; -} Cstate_hint_t; - - -struct pmData { - uint8_t pad[93]; -}; -typedef struct pmData pmData_t; - -#define pmNapHalt 0x00000010 -#define pmNapC1 0x00000008 -#define pmNapC2 0x00000004 -#define pmNapC3 0x00000002 -#define pmNapC4 0x00000001 -#define pmNapMask 0x000000FF - -#define cfgAdr 0xCF8 -#define cfgDat 0xCFC -#define lpcCfg (0x80000000 | (0 << 16) | (31 << 11) | (0 << 8)) - /* * This value should be changed each time that pmDsipatch_t or pmCallBacks_t * changes. 
*/ -#define PM_DISPATCH_VERSION 7 +#define PM_DISPATCH_VERSION 12 /* * Dispatch table for functions that get installed when the power @@ -76,79 +45,55 @@ typedef struct pmData pmData_t; */ typedef struct { - /* - * The following are the stepper table interfaces. - */ int (*pmCPUStateInit)(void); - void (*pmsInit)(void); - void (*pmsStart)(void); - void (*pmsPark)(void); - kern_return_t (*pmsCPUSetPStateLimit)(uint32_t limit); - - /* - * The following are legacy stepper interfaces. - */ - void (*pmsRun)(uint32_t nstep); - kern_return_t (*pmsBuild)(pmsDef *pd, uint32_t pdsize, pmsSetFunc_t *functab, uint32_t platformData, pmsQueryFunc_t queryFunc); - kern_return_t (*pmsCPULoadVIDTable)(uint16_t *tablep, int nstates); /* * The following are the 'C' State interfaces. */ void (*cstateInit)(void); uint64_t (*cstateMachineIdle)(uint64_t maxIdleDuration); - kern_return_t (*cstateTableSet)(Cstate_hint_t *tablep, unsigned int nstates); uint64_t (*GetDeadline)(x86_lcpu_t *lcpu); uint64_t (*SetDeadline)(x86_lcpu_t *lcpu, uint64_t); void (*Deadline)(x86_lcpu_t *lcpu); boolean_t (*exitIdle)(x86_lcpu_t *lcpu); void (*markCPURunning)(x86_lcpu_t *lcpu); - void (*HPETInterrupt)(void); int (*pmCPUControl)(uint32_t cmd, void *datap); void (*pmCPUHalt)(void); uint64_t (*getMaxSnoop)(void); void (*setMaxBusDelay)(uint64_t time); uint64_t (*getMaxBusDelay)(void); + void (*setMaxIntDelay)(uint64_t time); + uint64_t (*getMaxIntDelay)(void); void (*pmCPUSafeMode)(x86_lcpu_t *lcpu, uint32_t flags); + void (*pmTimerStateSave)(void); + void (*pmTimerStateRestore)(void); + kern_return_t (*exitHalt)(x86_lcpu_t *lcpu); + void (*markAllCPUsOff)(void); } pmDispatch_t; typedef struct { - uint32_t PState; - uint32_t PLimit; - uint16_t VIDTable[MAX_PSTATES]; - uint32_t VIDTableCount; - Cstate_hint_t CStates[Cnmax]; - uint32_t CStatesCount; - uint64_t maxBusDelay; -} pmInitState_t; - -typedef struct { - uint64_t *(*HPETAddr)(void); - pmInitState_t *InitState; int (*setRTCPop)(uint64_t time); void 
(*resyncDeadlines)(void); void (*initComplete)(void); x86_lcpu_t *(*GetLCPU)(int cpu); x86_core_t *(*GetCore)(int cpu); + x86_die_t *(*GetDie)(int cpu); x86_pkg_t *(*GetPackage)(int cpu); x86_lcpu_t *(*GetMyLCPU)(void); x86_core_t *(*GetMyCore)(void); + x86_die_t *(*GetMyDie)(void); x86_pkg_t *(*GetMyPackage)(void); - uint32_t CoresPerPkg; x86_pkg_t *(*GetPkgRoot)(void); void (*LockCPUTopology)(int lock); boolean_t (*GetHibernate)(int cpu); processor_t (*LCPUtoProcessor)(int lcpu); + processor_t (*ThreadBind)(processor_t proc); + x86_topology_parameters_t *topoParms; } pmCallBacks_t; extern pmDispatch_t *pmDispatch; -extern uint32_t forcenap; - void power_management_init(void); -void machine_nap_policy(void); -kern_return_t Cstate_table_set(Cstate_hint_t *tablep, unsigned int nstates); -void machine_idle_cstate(boolean_t halted); void pmKextRegister(uint32_t version, pmDispatch_t *cpuFuncs, pmCallBacks_t *callbacks); void pmUnRegister(pmDispatch_t *cpuFuncs); @@ -158,13 +103,17 @@ uint64_t pmCPUSetDeadline(struct cpu_data *cpu, uint64_t deadline); void pmCPUDeadline(struct cpu_data *cpu); boolean_t pmCPUExitIdle(struct cpu_data *cpu); void pmCPUMarkRunning(struct cpu_data *cpu); -void pmHPETInterrupt(void); +void pmMarkAllCPUsOff(void); int pmCPUControl(uint32_t cmd, void *datap); void pmCPUHalt(uint32_t reason); +void pmTimerSave(void); +void pmTimerRestore(void); +kern_return_t pmCPUExitHalt(int cpu); #define PM_HALT_NORMAL 0 /* normal halt path */ #define PM_HALT_DEBUG 1 /* debug code wants to halt */ #define PM_HALT_PANIC 2 /* panic code wants to halt */ +#define PM_HALT_SLEEP 3 /* sleep code wants to halt */ void pmSafeMode(x86_lcpu_t *lcpu, uint32_t flags); @@ -174,6 +123,14 @@ void pmSafeMode(x86_lcpu_t *lcpu, uint32_t flags); #define PM_SAFE_FL_RESUME 0x00000020 /* resume execution on the CPU */ extern int pmsafe_debug; +extern int idlehalt; + +/****************************************************************************** + * + * All of the following are 
deprecated interfaces and no longer used. + * + ******************************************************************************/ + #endif /* ASSEMBLER */ diff --git a/osfmk/i386/pmap.c b/osfmk/i386/pmap.c index b83947193..a424d7e11 100644 --- a/osfmk/i386/pmap.c +++ b/osfmk/i386/pmap.c @@ -1215,7 +1215,7 @@ pmap_bootstrap( virtual_avail = va; - if (PE_parse_boot_arg("npvhash", &npvhash)) { + if (PE_parse_boot_argn("npvhash", &npvhash, sizeof (npvhash))) { if (0 != ((npvhash+1) & npvhash)) { kprintf("invalid hash %d, must be ((2^N)-1), using default %d\n",npvhash,NPVHASH); npvhash = NPVHASH; @@ -1226,7 +1226,7 @@ pmap_bootstrap( printf("npvhash=%d\n",npvhash); wpkernel = 1; - if (PE_parse_boot_arg("wpkernel", &boot_arg)) { + if (PE_parse_boot_argn("wpkernel", &boot_arg, sizeof (boot_arg))) { if (boot_arg == 0) wpkernel = 0; } @@ -1331,12 +1331,12 @@ pmap_bootstrap( * By default for 64-bit users loaded at 4GB, share kernel mapping. * But this may be overridden by the -no_shared_cr3 boot-arg. 
*/ - if (PE_parse_boot_arg("-no_shared_cr3", &no_shared_cr3)) { + if (PE_parse_boot_argn("-no_shared_cr3", &no_shared_cr3, sizeof (no_shared_cr3))) { kprintf("Shared kernel address space disabled\n"); } #ifdef PMAP_TRACES - if (PE_parse_boot_arg("-pmap_trace", &pmap_trace)) { + if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof (pmap_trace))) { kprintf("Kernel traces for pmap operations enabled\n"); } #endif /* PMAP_TRACES */ @@ -4573,8 +4573,10 @@ pmap_flush_tlbs(pmap_t pmap) */ while (cpus_to_respond != 0) { if (mach_absolute_time() > deadline) { - pmap_tlb_flush_timeout = TRUE; - pmap_cpuset_NMIPI(cpus_to_respond); + if (!panic_active()) { + pmap_tlb_flush_timeout = TRUE; + pmap_cpuset_NMIPI(cpus_to_respond); + } panic("pmap_flush_tlbs() timeout: " "cpu(s) failing to respond to interrupts, pmap=%p cpus_to_respond=0x%lx", pmap, cpus_to_respond); diff --git a/osfmk/i386/proc_reg.h b/osfmk/i386/proc_reg.h index ee5fe6de1..a74ca7548 100644 --- a/osfmk/i386/proc_reg.h +++ b/osfmk/i386/proc_reg.h @@ -275,7 +275,7 @@ static inline void invlpg(unsigned long addr) __asm__ volatile("wrmsr" : : "c" (msr), "a" (lo), "d" (hi)) #define rdtsc(lo,hi) \ - __asm__ volatile("rdtsc" : "=a" (lo), "=d" (hi)) + __asm__ volatile("rdtsc; lfence" : "=a" (lo), "=d" (hi)) #define write_tsc(lo,hi) wrmsr(0x10, lo, hi) @@ -297,7 +297,7 @@ static inline void wrmsr64(uint32_t msr, uint64_t val) static inline uint64_t rdtsc64(void) { uint64_t ret; - __asm__ volatile("rdtsc" : "=A" (ret)); + __asm__ volatile("rdtsc; lfence" : "=A" (ret)); return ret; } @@ -410,4 +410,6 @@ __END_DECLS #define MSR_IA32_GS_BASE 0xC0000101 #define MSR_IA32_KERNEL_GS_BASE 0xC0000102 +#define MSR_IA32_BIOS_SIGN_ID 0x08B + #endif /* _I386_PROC_REG_H_ */ diff --git a/osfmk/i386/rtclock.c b/osfmk/i386/rtclock.c index a1784f3bf..982c160f4 100644 --- a/osfmk/i386/rtclock.c +++ b/osfmk/i386/rtclock.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. 
All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -60,7 +60,7 @@ #include #include #include -#include +#include #include #include #include @@ -71,7 +71,6 @@ #include #include #include -#include #include #define NSEC_PER_HZ (NSEC_PER_SEC / 100) /* nsec per tick */ @@ -93,20 +92,8 @@ extern clock_timer_func_t rtclock_timer_expire; static void rtc_set_timescale(uint64_t cycles); static uint64_t rtc_export_speed(uint64_t cycles); -extern void _rtc_nanotime_store( - uint64_t tsc, - uint64_t nsec, - uint32_t scale, - uint32_t shift, - rtc_nanotime_t *dst); - -extern uint64_t _rtc_nanotime_read( - rtc_nanotime_t *rntp, - int slow ); - rtc_nanotime_t rtc_nanotime_info = {0,0,0,0,1,0}; - /* * tsc_to_nanoseconds: * @@ -124,7 +111,9 @@ _tsc_to_nanoseconds(uint64_t value) "mull %%ecx ;" "addl %%edi,%%eax ;" "adcl $0,%%edx " - : "+A" (value) : "c" (rtc_nanotime_info.scale) : "esi", "edi"); + : "+A" (value) + : "c" (current_cpu_datap()->cpu_nanotime->scale) + : "esi", "edi"); return (value); } @@ -212,7 +201,7 @@ _rtc_nanotime_init(rtc_nanotime_t *rntp, uint64_t base) static void rtc_nanotime_init(uint64_t base) { - rtc_nanotime_t *rntp = &rtc_nanotime_info; + rtc_nanotime_t *rntp = current_cpu_datap()->cpu_nanotime; _rtc_nanotime_init(rntp, base); rtc_nanotime_set_commpage(rntp); @@ -230,7 +219,7 @@ rtc_nanotime_init_commpage(void) { spl_t s = splclock(); - rtc_nanotime_set_commpage(&rtc_nanotime_info); + rtc_nanotime_set_commpage(current_cpu_datap()->cpu_nanotime); splx(s); } @@ -247,10 +236,10 @@ rtc_nanotime_read(void) #if CONFIG_EMBEDDED if (gPEClockFrequencyInfo.timebase_frequency_hz > SLOW_TSC_THRESHOLD) - return _rtc_nanotime_read( &rtc_nanotime_info, 1 ); /* slow processor */ + return _rtc_nanotime_read(current_cpu_datap()->cpu_nanotime, 1); /* slow processor */ else #endif - return _rtc_nanotime_read( &rtc_nanotime_info, 0 ); /* assume fast processor */ + return _rtc_nanotime_read(current_cpu_datap()->cpu_nanotime, 0); /* assume fast processor */ } 
/* @@ -263,7 +252,7 @@ rtc_nanotime_read(void) void rtc_clock_napped(uint64_t base, uint64_t tsc_base) { - rtc_nanotime_t *rntp = &rtc_nanotime_info; + rtc_nanotime_t *rntp = current_cpu_datap()->cpu_nanotime; uint64_t oldnsecs; uint64_t newnsecs; uint64_t tsc; @@ -372,12 +361,13 @@ rtclock_init(void) static void rtc_set_timescale(uint64_t cycles) { - rtc_nanotime_info.scale = ((uint64_t)NSEC_PER_SEC << 32) / cycles; + rtc_nanotime_t *rntp = current_cpu_datap()->cpu_nanotime; + rntp->scale = ((uint64_t)NSEC_PER_SEC << 32) / cycles; if (cycles <= SLOW_TSC_THRESHOLD) - rtc_nanotime_info.shift = cycles; + rntp->shift = cycles; else - rtc_nanotime_info.shift = 32; + rntp->shift = 32; rtc_nanotime_init(0); } diff --git a/osfmk/i386/rtclock.h b/osfmk/i386/rtclock.h index 904b38786..e3ea716d4 100644 --- a/osfmk/i386/rtclock.h +++ b/osfmk/i386/rtclock.h @@ -40,8 +40,90 @@ #ifndef _I386_RTCLOCK_H_ #define _I386_RTCLOCK_H_ +#ifndef ASSEMBLER +typedef struct rtc_nanotime { + uint64_t tsc_base; /* timestamp */ + uint64_t ns_base; /* nanoseconds */ + uint32_t scale; /* tsc -> nanosec multiplier */ + uint32_t shift; /* tsc -> nanosec shift/div */ + /* shift is overloaded with + * lower 32bits of tsc_freq + * on slower machines (SLOW_TSC_THRESHOLD) */ + uint32_t generation; /* 0 == being updated */ + uint32_t spare1; +} rtc_nanotime_t; + #include struct cpu_data; +extern void _rtc_nanotime_store( + uint64_t tsc, + uint64_t nsec, + uint32_t scale, + uint32_t shift, + rtc_nanotime_t *dst); + +extern uint64_t _rtc_nanotime_read( + rtc_nanotime_t *rntp, + int slow); + +extern rtc_nanotime_t rtc_nanotime_info; +#endif + +#define SLOW_TSC_THRESHOLD 1000067800 /* TSC is too slow for regular nanotime() algorithm */ + +#if defined(__i386__) +/* + * Assembly snippet included in exception handlers and rtc_nanotime_read() + * %edi points to nanotime info struct + * %edx:%eax returns nanotime + */ +#define RTC_NANOTIME_READ_FAST() \ +0: movl RNT_GENERATION(%edi),%esi /* being updated? 
*/ ; \ + testl %esi,%esi ; \ + jz 0b /* wait until done */ ; \ + rdtsc ; \ + lfence ; \ + subl RNT_TSC_BASE(%edi),%eax ; \ + sbbl RNT_TSC_BASE+4(%edi),%edx /* tsc - tsc_base */ ; \ + movl RNT_SCALE(%edi),%ecx /* * scale factor */ ; \ + movl %edx,%ebx ; \ + mull %ecx ; \ + movl %ebx,%eax ; \ + movl %edx,%ebx ; \ + mull %ecx ; \ + addl %ebx,%eax ; \ + adcl $0,%edx ; \ + addl RNT_NS_BASE(%edi),%eax /* + ns_base */ ; \ + adcl RNT_NS_BASE+4(%edi),%edx ; \ + cmpl RNT_GENERATION(%edi),%esi /* check for update */ ; \ + jne 0b /* do it all again */ + +#elif defined(__x86_64__) + +/* + * Assembly snippet included in exception handlers and rtc_nanotime_read() + * %rdi points to nanotime info struct. + * %rax returns nanotime + */ +#define RTC_NANOTIME_READ_FAST() \ +0: movl RNT_GENERATION(%rdi),%esi ; \ + test %esi,%esi /* info updating? */ ; \ + jz 0b /* - wait if so */ ; \ + rdtsc ; \ + lfence ; \ + shlq $32,%rdx ; \ + orq %rdx,%rax /* %rax := tsc */ ; \ + subq RNT_TSC_BASE(%rdi),%rax /* tsc - tsc_base */ ; \ + xorq %rcx,%rcx ; \ + movl RNT_SCALE(%rdi),%ecx ; \ + mulq %rcx /* delta * scale */ ; \ + shrdq $32,%rdx,%rax /* %rdx:%rax >>= 32 */ ; \ + addq RNT_NS_BASE(%rdi),%rax /* add ns_base */ ; \ + cmpl RNT_GENERATION(%rdi),%esi /* repeat if changed */ ; \ + jne 0b + +#endif + #endif /* _I386_RTCLOCK_H_ */ diff --git a/osfmk/i386/start.s b/osfmk/i386/start.s index 3218d3d15..e063283aa 100644 --- a/osfmk/i386/start.s +++ b/osfmk/i386/start.s @@ -66,7 +66,7 @@ #define CX(addr,reg) addr(,reg,4) -#include +#include #include #include diff --git a/osfmk/i386/trap.c b/osfmk/i386/trap.c index d65419a23..b263be9ff 100644 --- a/osfmk/i386/trap.c +++ b/osfmk/i386/trap.c @@ -112,6 +112,9 @@ #include #include + +extern void throttle_lowpri_io(boolean_t); + /* * Forward declarations */ @@ -163,7 +166,9 @@ thread_syscall_return( } regs->eax = ret; } - thread_exception_return(); + throttle_lowpri_io(TRUE); + + thread_exception_return(); /*NOTREACHED*/ } diff --git a/osfmk/i386/tsc.c 
b/osfmk/i386/tsc.c index 19b7469a6..624e5d431 100644 --- a/osfmk/i386/tsc.c +++ b/osfmk/i386/tsc.c @@ -75,6 +75,10 @@ uint64_t tscFCvtn2t = 0; uint64_t tscGranularity = 0; uint64_t bus2tsc = 0; uint64_t busFreq = 0; +uint32_t flex_ratio = 0; +uint32_t flex_ratio_min = 0; +uint32_t flex_ratio_max = 0; + #define bit(n) (1ULL << (n)) #define bitmask(h,l) ((bit(h)|(bit(h)-1)) & ~(bit(l)-1)) @@ -91,8 +95,7 @@ uint64_t busFreq = 0; static const char FSB_Frequency_prop[] = "FSBFrequency"; /* - * This routine extracts the front-side bus frequency in Hz from - * the device tree. + * This routine extracts the bus frequency in Hz from the device tree. */ static uint64_t EFI_FSB_frequency(void) @@ -136,25 +139,39 @@ tsc_init(void) boolean_t N_by_2_bus_ratio = FALSE; /* - * Get the FSB frequency and conversion factors. + * Get the FSB frequency and conversion factors from EFI. */ busFreq = EFI_FSB_frequency(); + + if (cpuid_info()->cpuid_family != CPU_FAMILY_PENTIUM_M) { + panic("tsc_init: unknown CPU family: 0x%X\n", + cpuid_info()->cpuid_family); + } + + { + uint64_t prfsts; + + prfsts = rdmsr64(IA32_PERF_STS); + tscGranularity = (uint32_t)bitfield(prfsts, 44, 40); + N_by_2_bus_ratio = (prfsts & bit(46)) != 0; + } + if (busFreq != 0) { busFCvtt2n = ((1 * Giga) << 32) / busFreq; busFCvtn2t = 0xFFFFFFFFFFFFFFFFULL / busFCvtt2n; busFCvtInt = tmrCvt(1 * Peta, 0xFFFFFFFFFFFFFFFFULL / busFreq); } else { - panic("rtclock_init: EFI not supported!\n"); + panic("tsc_init: EFI not supported!\n"); } kprintf(" BUS: Frequency = %6d.%04dMHz, " - "cvtt2n = %08X.%08X, cvtn2t = %08X.%08X, " - "cvtInt = %08X.%08X\n", - (uint32_t)(busFreq / Mega), - (uint32_t)(busFreq % Mega), - (uint32_t)(busFCvtt2n >> 32), (uint32_t)busFCvtt2n, - (uint32_t)(busFCvtn2t >> 32), (uint32_t)busFCvtn2t, - (uint32_t)(busFCvtInt >> 32), (uint32_t)busFCvtInt); + "cvtt2n = %08X.%08X, cvtn2t = %08X.%08X, " + "cvtInt = %08X.%08X\n", + (uint32_t)(busFreq / Mega), + (uint32_t)(busFreq % Mega), + (uint32_t)(busFCvtt2n >> 
32), (uint32_t)busFCvtt2n, + (uint32_t)(busFCvtn2t >> 32), (uint32_t)busFCvtn2t, + (uint32_t)(busFCvtInt >> 32), (uint32_t)busFCvtInt); /* * Get the TSC increment. The TSC is incremented by this @@ -164,18 +181,6 @@ tsc_init(void) * is set this indicates the bus ration is 0.5 more than this - i.e. * that the true bus ratio is (2*tscGranularity + 1)/2. */ - if (cpuid_info()->cpuid_family == CPU_FAMILY_PENTIUM_M) { - uint64_t prfsts; - - prfsts = rdmsr64(IA32_PERF_STS); - tscGranularity = (uint32_t)bitfield(prfsts, 44, 40); - N_by_2_bus_ratio = (prfsts & bit(46)) != 0; - - } else { - panic("rtclock_init: unknown CPU family: 0x%X\n", - cpuid_info()->cpuid_family); - } - if (N_by_2_bus_ratio) tscFCvtt2n = busFCvtt2n * 2 / (1 + 2*tscGranularity); else @@ -185,12 +190,12 @@ tsc_init(void) tscFCvtn2t = 0xFFFFFFFFFFFFFFFFULL / tscFCvtt2n; kprintf(" TSC: Frequency = %6d.%04dMHz, " - "cvtt2n = %08X.%08X, cvtn2t = %08X.%08X, gran = %lld%s\n", - (uint32_t)(tscFreq / Mega), - (uint32_t)(tscFreq % Mega), - (uint32_t)(tscFCvtt2n >> 32), (uint32_t)tscFCvtt2n, - (uint32_t)(tscFCvtn2t >> 32), (uint32_t)tscFCvtn2t, - tscGranularity, N_by_2_bus_ratio ? " (N/2)" : ""); + "cvtt2n = %08X.%08X, cvtn2t = %08X.%08X, gran = %lld%s\n", + (uint32_t)(tscFreq / Mega), + (uint32_t)(tscFreq % Mega), + (uint32_t)(tscFCvtt2n >> 32), (uint32_t)tscFCvtt2n, + (uint32_t)(tscFCvtn2t >> 32), (uint32_t)tscFCvtn2t, + tscGranularity, N_by_2_bus_ratio ? 
" (N/2)" : ""); /* * Calculate conversion from BUS to TSC @@ -209,4 +214,7 @@ tsc_get_info(tscInfo_t *info) info->tscGranularity = tscGranularity; info->bus2tsc = bus2tsc; info->busFreq = busFreq; + info->flex_ratio = flex_ratio; + info->flex_ratio_min = flex_ratio_min; + info->flex_ratio_max = flex_ratio_max; } diff --git a/osfmk/i386/tsc.h b/osfmk/i386/tsc.h index 79ece7085..1b6589de7 100644 --- a/osfmk/i386/tsc.h +++ b/osfmk/i386/tsc.h @@ -40,7 +40,7 @@ #ifndef _I386_TSC_H_ #define _I386_TSC_H_ -#define IA32_PERF_STS 0x198 +#define IA32_PERF_STS 0x198 extern uint64_t busFCvtt2n; extern uint64_t busFCvtn2t; @@ -50,17 +50,23 @@ extern uint64_t tscFCvtn2t; extern uint64_t tscGranularity; extern uint64_t bus2tsc; extern uint64_t busFreq; +extern uint32_t flex_ratio; +extern uint32_t flex_ratio_min; +extern uint32_t flex_ratio_max; struct tscInfo { -uint64_t busFCvtt2n; -uint64_t busFCvtn2t; -uint64_t tscFreq; -uint64_t tscFCvtt2n; -uint64_t tscFCvtn2t; -uint64_t tscGranularity; -uint64_t bus2tsc; -uint64_t busFreq; + uint64_t busFCvtt2n; + uint64_t busFCvtn2t; + uint64_t tscFreq; + uint64_t tscFCvtt2n; + uint64_t tscFCvtn2t; + uint64_t tscGranularity; + uint64_t bus2tsc; + uint64_t busFreq; + uint32_t flex_ratio; + uint32_t flex_ratio_min; + uint32_t flex_ratio_max; }; typedef struct tscInfo tscInfo_t; diff --git a/osfmk/i386/user_ldt.c b/osfmk/i386/user_ldt.c index 87aab0426..e06afda78 100644 --- a/osfmk/i386/user_ldt.c +++ b/osfmk/i386/user_ldt.c @@ -152,7 +152,7 @@ i386_set_ldt( start_sel = LDTSZ_MIN; } - if (start_sel + num_sels > LDTSZ) { + if ((uint64_t)start_sel + (uint64_t)num_sels > LDTSZ) { task_unlock(task); return ENOMEM; } @@ -294,7 +294,7 @@ i386_get_ldt( if (start_sel >= 8192) return EINVAL; - if (start_sel + num_sels > 8192) + if ((uint64_t)start_sel + (uint64_t)num_sels > 8192) return EINVAL; if (descs == 0) return EINVAL; diff --git a/osfmk/kdp/kdp.h b/osfmk/kdp/kdp.h index 9d5b91352..ab846c753 100644 --- a/osfmk/kdp/kdp.h +++ b/osfmk/kdp/kdp.h @@ 
-45,3 +45,5 @@ kdp_raise_exception( void kdp_reset(void); +void +kdp_init(void); diff --git a/osfmk/kdp/kdp_serial.c b/osfmk/kdp/kdp_serial.c new file mode 100644 index 000000000..bc8f1369b --- /dev/null +++ b/osfmk/kdp/kdp_serial.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2008 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include "kdp_serial.h" + +#define SKDP_START_CHAR 0xFA +#define SKDP_END_CHAR 0xFB +#define SKDP_ESC_CHAR 0xFE + +static enum {DS_WAITSTART, DS_READING, DS_ESCAPED} dsState; +static unsigned char dsBuffer[1518]; +static int dsPos; + +void kdp_serialize_packet(unsigned char *packet, unsigned int len, void (*outFunc)(char)) +{ + unsigned int index; + outFunc(SKDP_START_CHAR); + for (index = 0; index < len; index++) { + unsigned char byte = *packet++; + //need to escape '\n' because the kernel serial output turns it into a cr/lf + if(byte == SKDP_START_CHAR || byte == SKDP_END_CHAR || byte == SKDP_ESC_CHAR || byte == '\n') + { + outFunc(SKDP_ESC_CHAR); + byte = ~byte; + } + outFunc(byte); + } + outFunc(SKDP_END_CHAR); +} + +unsigned char *kdp_unserialize_packet(unsigned char byte, unsigned int *len) +{ + switch(dsState) + { + case DS_WAITSTART: + if(byte == SKDP_START_CHAR) + { +// printf("got start char\n"); + dsState = DS_READING; + dsPos = 0; + *len = SERIALIZE_READING; + return 0; + } + *len = SERIALIZE_WAIT_START; + break; + case DS_READING: + if(byte == SKDP_ESC_CHAR) + { + dsState = DS_ESCAPED; + *len = SERIALIZE_READING; + return 0; + } + if(byte == SKDP_START_CHAR) + { +// printf("unexpected start char, resetting\n"); + dsPos = 0; + *len = SERIALIZE_READING; + return 0; + } + if(byte == SKDP_END_CHAR) + { + dsState = DS_WAITSTART; + *len = dsPos; + dsPos = 0; + return dsBuffer; + } + dsBuffer[dsPos++] = byte; + break; + case DS_ESCAPED: +// printf("unescaping %02x to %02x\n", byte, ~byte); + dsBuffer[dsPos++] = ~byte; + dsState = DS_READING; + *len = SERIALIZE_READING; + break; + } + if(dsPos == sizeof(dsBuffer)) //too much data...forget this packet + { + dsState = DS_WAITSTART; + dsPos = 0; + *len = SERIALIZE_WAIT_START; + } + + return 0; +} diff --git a/osfmk/i386/hw_defs.h b/osfmk/kdp/kdp_serial.h similarity index 63% rename from osfmk/i386/hw_defs.h rename to osfmk/kdp/kdp_serial.h index 
0fac10f3c..68dc30129 100644 --- a/osfmk/i386/hw_defs.h +++ b/osfmk/kdp/kdp_serial.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,21 +25,27 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef _I386_HW_DEFS_H_ -#define _I386_HW_DEFS_H_ +#ifndef _KDP_SERIAL_H_ +#define _KDP_SERIAL_H_ +/* + * APIs for escaping a KDP UDP packet into a byte stream suitable + * for a standard serial console + */ -#define pmMwaitC1 0x00 -#define pmMwaitC2 0x10 -#define pmMwaitC3 0x20 -#define pmMwaitC4 0x30 -#define pmMwaitBrInt 0x1 +enum {SERIALIZE_WAIT_START, SERIALIZE_READING}; -#define pmBase 0x400 -#define pmCtl1 0x04 -#define pmCtl2 0x20 -#define pmC3Res 0x54 -#define pmStatus 0x00 -#define msrTSC 0x10 +/* + * Take a buffer of specified length and output it with the given + * function. Escapes special characters as needed + */ +void kdp_serialize_packet(unsigned char *, unsigned int, void (*func)(char)); + +/* + * Add a new character to an internal buffer, and return that + * buffer when a fully constructed packet has been identified. + * Will track intermediate state using magic enums above + */ +unsigned char *kdp_unserialize_packet(unsigned char, unsigned int *); -#endif /* _I386_HW_DEFS_H_ */ +#endif diff --git a/osfmk/kdp/kdp_udp.c b/osfmk/kdp/kdp_udp.c index e47f63dfc..1575afcca 100644 --- a/osfmk/kdp/kdp_udp.c +++ b/osfmk/kdp/kdp_udp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -41,11 +41,16 @@ #include #include #include +#include #include #include #include +#include #include +#if CONFIG_SERIAL_KDP +#include +#endif #include #include @@ -60,6 +65,10 @@ extern int kdp_getc(void); extern int reattach_wait; +extern int serial_getc(void); +extern void serial_putc(char); +extern int serial_init(void); + static u_short ip_id; /* ip packet ctr, for ids */ /* @(#)udp_usrreq.c 2.2 88/05/23 4.0NFSSRC SMI; from UCB 7.1 6/5/86 */ @@ -219,19 +228,19 @@ kdp_register_send_receive( { unsigned int debug = 0; - kdp_en_send_pkt = send; - kdp_en_recv_pkt = receive; - debug_log_init(); kdp_timer_callout_init(); - PE_parse_boot_arg("debug", &debug); + PE_parse_boot_argn("debug", &debug, sizeof (debug)); if (!debug) return; + kdp_en_send_pkt = send; + kdp_en_recv_pkt = receive; + if (debug & DB_KDP_BP_DIS) kdp_flag |= KDP_BP_DIS; if (debug & DB_KDP_GETC_ENA) @@ -250,13 +259,13 @@ kdp_register_send_receive( if (debug & DB_PANICLOG_DUMP) kdp_flag |= PANIC_LOG_DUMP; - if (PE_parse_boot_arg ("_panicd_ip", panicd_ip_str)) + if (PE_parse_boot_argn("_panicd_ip", panicd_ip_str, sizeof (panicd_ip_str))) panicd_specified = TRUE; - if (PE_parse_boot_arg ("_router_ip", router_ip_str)) + if (PE_parse_boot_argn("_router_ip", router_ip_str, sizeof (router_ip_str))) router_specified = TRUE; - if (!PE_parse_boot_arg ("panicd_port", &panicd_port)) + if (!PE_parse_boot_argn("panicd_port", &panicd_port, sizeof (panicd_port))) panicd_port = CORE_REMOTE_PORT; kdp_flag |= KDP_READY; @@ -1438,7 +1447,6 @@ kdp_get_xnu_version(char *versionbuf) } extern char *inet_aton(const char *cp, struct in_addr *pin); -extern int snprintf(char *str, size_t size, const char *format, ...); /* Primary dispatch routine for the system dump */ void @@ -1558,3 +1566,111 @@ abort_panic_transfer(void) not_in_kdp = 1; panic_block = 0; } + +#if CONFIG_SERIAL_KDP + +static boolean_t needs_serial_init = TRUE; + +static void +kdp_serial_send(void *rpkt, unsigned 
int rpkt_len) +{ + if (needs_serial_init) + { + serial_init(); + needs_serial_init = FALSE; + } + + // printf("tx\n"); + kdp_serialize_packet((unsigned char *)rpkt, rpkt_len, serial_putc); +} + +static void +kdp_serial_receive(void *rpkt, unsigned int *rpkt_len, unsigned int timeout) +{ + int readkar; + uint64_t now, deadline; + + if (needs_serial_init) + { + serial_init(); + needs_serial_init = FALSE; + } + + clock_interval_to_deadline(timeout, 1000 * 1000 /* milliseconds */, &deadline); + +// printf("rx\n"); + for(clock_get_uptime(&now); now < deadline; clock_get_uptime(&now)) + { + readkar = serial_getc(); + if(readkar >= 0) + { + unsigned char *packet; + // printf("got char %02x\n", readkar); + if((packet = kdp_unserialize_packet(readkar,rpkt_len))) + { + memcpy(rpkt, packet, *rpkt_len); + return; + } + } + } + *rpkt_len = 0; +} + +static void kdp_serial_callout(__unused void *arg, kdp_event_t event) +{ + /* When we stop KDP, set the bit to re-initialize the console serial port + * the next time we send/receive a KDP packet. We don't do it on + * KDP_EVENT_ENTER directly because it also gets called when we trap to KDP + * for non-external debugging, i.e., stackshot or core dumps. + * + * Set needs_serial_init on exit (and initialization, see above) and not + * enter because enter is sent multiple times and causes excess reinitialization. 
+ */ + + switch (event) + { + case KDP_EVENT_PANICLOG: + case KDP_EVENT_ENTER: + break; + case KDP_EVENT_EXIT: + needs_serial_init = TRUE; + break; + } +} + +#endif /* CONFIG_SERIAL_KDP */ + +void +kdp_init(void) +{ +#if CONFIG_SERIAL_KDP + char kdpname[80]; + struct in_addr ipaddr; + struct ether_addr macaddr; + +#if CONFIG_EMBEDDED + //serial will be the debugger, unless match name is explicitly provided, and it's not "serial" + if(PE_parse_boot_argn("kdp_match_name", kdpname, sizeof(kdpname)) && strncmp(kdpname, "serial", sizeof(kdpname)) != 0) + return; +#else + // serial must be explicitly requested + if(!PE_parse_boot_argn("kdp_match_name", kdpname, sizeof(kdpname)) || strncmp(kdpname, "serial", sizeof(kdpname)) != 0) + return; +#endif + + kprintf("Intializing serial KDP\n"); + + kdp_register_callout(kdp_serial_callout, NULL); + kdp_register_send_receive(kdp_serial_send, kdp_serial_receive); + + /* fake up an ip and mac for early serial debugging */ + macaddr.ether_addr_octet[0] = 's'; + macaddr.ether_addr_octet[1] = 'e'; + macaddr.ether_addr_octet[2] = 'r'; + macaddr.ether_addr_octet[3] = 'i'; + macaddr.ether_addr_octet[4] = 'a'; + macaddr.ether_addr_octet[5] = 'l'; + ipaddr.s_addr = 0xABADBABE; + kdp_set_ip_and_mac_addresses(&ipaddr, &macaddr); +#endif /* CONFIG_SERIAL_KDP */ +} diff --git a/osfmk/kern/bsd_kern.c b/osfmk/kern/bsd_kern.c index d09e4ea6e..c13e40826 100644 --- a/osfmk/kern/bsd_kern.c +++ b/osfmk/kern/bsd_kern.c @@ -245,6 +245,22 @@ ipc_space_t get_task_ipcspace(task_t t) return(t->itk_space); } +int get_task_numactivethreads(task_t task) +{ + thread_t inc; + int num_active_thr=0; + task_lock(task); + + for (inc = (thread_t)queue_first(&task->threads); + !queue_end(&task->threads, (queue_entry_t)inc); inc = (thread_t)queue_next(&inc->task_threads)) + { + if(inc->active) + num_active_thr++; + } + task_unlock(task); + return num_active_thr; +} + int get_task_numacts(task_t t) { return(t->thread_count); diff --git a/osfmk/kern/debug.c 
b/osfmk/kern/debug.c index c7e01a134..679e1779c 100644 --- a/osfmk/kern/debug.c +++ b/osfmk/kern/debug.c @@ -106,7 +106,7 @@ unsigned int panic_is_inited = 0; unsigned int return_on_panic = 0; unsigned long panic_caller; -char *debug_buf; +char debug_buf[PAGE_SIZE]; ppnum_t debug_buf_page; char *debug_buf_ptr; unsigned int debug_buf_size; @@ -183,9 +183,6 @@ debug_log_init(void) { if (debug_buf_size != 0) return; - if (kmem_alloc(kernel_map, (vm_offset_t *) &debug_buf, PAGE_SIZE) - != KERN_SUCCESS) - panic("cannot allocate debug_buf\n"); debug_buf_ptr = debug_buf; debug_buf_size = PAGE_SIZE; debug_buf_page = pmap_find_phys(kernel_pmap, diff --git a/osfmk/kern/debug.h b/osfmk/kern/debug.h index 4f2ab7f87..e861592e6 100644 --- a/osfmk/kern/debug.h +++ b/osfmk/kern/debug.h @@ -35,6 +35,7 @@ #ifdef KERNEL_PRIVATE extern unsigned int systemLogDiags; +extern char debug_buf[]; #ifdef MACH_KERNEL_PRIVATE @@ -49,7 +50,7 @@ extern unsigned int current_debugger; extern unsigned int active_debugger; extern unsigned int debug_mode; -extern unsigned int disable_debug_output; +extern unsigned int disable_debug_output; extern unsigned int panicDebugging; extern unsigned int logPanicDataToScreen; @@ -71,7 +72,6 @@ extern const char *panicstr; extern volatile unsigned int nestedpanic; extern int unsigned long panic_caller; -extern char *debug_buf; extern char *debug_buf_ptr; extern unsigned int debug_buf_size; @@ -103,6 +103,7 @@ void panic_display_system_configuration(void); #define DB_DBG_POST_CORE 0x1000 /*Wait in debugger after NMI core */ #define DB_PANICLOG_DUMP 0x2000 /* Send paniclog on panic,not core*/ + #endif /* KERNEL_PRIVATE */ __BEGIN_DECLS diff --git a/osfmk/kern/kmod.c b/osfmk/kern/kmod.c index 9d88531f2..f30d897e2 100644 --- a/osfmk/kern/kmod.c +++ b/osfmk/kern/kmod.c @@ -1209,7 +1209,7 @@ kmod_free_linkedit_data(void) round_page_32(dt_symtab_size)); } - PE_parse_boot_arg("keepsyms", &keepsyms); + PE_parse_boot_argn("keepsyms", &keepsyms, sizeof (keepsyms)); 
segmentLE = getsegbyname(segment_name); if (!segmentLE) { diff --git a/osfmk/kern/locks.c b/osfmk/kern/locks.c index 5f9d4d80a..5718455f9 100644 --- a/osfmk/kern/locks.c +++ b/osfmk/kern/locks.c @@ -341,14 +341,13 @@ lck_attr_setdefault( lck_attr_t *attr) { #if !DEBUG - if (LcksOpts & enaLkDeb) - attr->lck_attr_val = LCK_ATTR_DEBUG; - else - attr->lck_attr_val = LCK_ATTR_NONE; + if (LcksOpts & enaLkDeb) + attr->lck_attr_val = LCK_ATTR_DEBUG; + else + attr->lck_attr_val = LCK_ATTR_NONE; #else - attr->lck_attr_val = LCK_ATTR_DEBUG; -#endif - + attr->lck_attr_val = LCK_ATTR_DEBUG; +#endif /* !DEBUG */ } diff --git a/osfmk/kern/machine.h b/osfmk/kern/machine.h index 1d3e4108d..106a9b41d 100644 --- a/osfmk/kern/machine.h +++ b/osfmk/kern/machine.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -46,7 +46,7 @@ extern void processor_up( extern void processor_offline( processor_t processor); -extern void processor_start_thread(void); +extern void processor_start_thread(void *machine_param); /* * Must be implemented in machine dependent code. diff --git a/osfmk/kern/misc_protos.h b/osfmk/kern/misc_protos.h index 84721990d..6d2ceb898 100644 --- a/osfmk/kern/misc_protos.h +++ b/osfmk/kern/misc_protos.h @@ -124,6 +124,8 @@ extern int kdb_printf(const char *format, ...) __printflike(1,2); extern void printf_init(void); +extern int snprintf(char *, size_t, const char *, ...) 
__printflike(3,4); + extern void log(int level, char *fmt, ...); void diff --git a/osfmk/kern/page_decrypt.c b/osfmk/kern/page_decrypt.c index f416dc105..f31c69908 100644 --- a/osfmk/kern/page_decrypt.c +++ b/osfmk/kern/page_decrypt.c @@ -64,7 +64,7 @@ dsmos_page_transform_hook(dsmos_page_transform_hook_t hook) } int -dsmos_page_transform(const void* from, void *to) +dsmos_page_transform(const void* from, void *to, __unused unsigned long long src_offset, __unused void *ops) { /* printf("%s\n", __FUNCTION__); */ if (dsmos_hook == NULL) @@ -72,3 +72,9 @@ dsmos_page_transform(const void* from, void *to) return (*dsmos_hook) (from, to); } + +text_crypter_create_hook_t text_crypter_create=NULL; +void text_crypter_create_hook_set(text_crypter_create_hook_t hook) +{ + text_crypter_create=hook; +}; diff --git a/osfmk/kern/page_decrypt.h b/osfmk/kern/page_decrypt.h index a0517a01f..f00202dff 100644 --- a/osfmk/kern/page_decrypt.h +++ b/osfmk/kern/page_decrypt.h @@ -29,10 +29,35 @@ #ifndef _KERN_PAGE_DECRYPT_H #define _KERN_PAGE_DECRYPT_H -typedef int (*dsmos_page_transform_hook_t) (const void *,void*); +/* + * Interface for DSMOS + */ +typedef int (*dsmos_page_transform_hook_t) (const void *,void*); extern void dsmos_page_transform_hook(dsmos_page_transform_hook_t hook); /* exported */ -extern int dsmos_page_transform(const void *,void*); +extern int dsmos_page_transform(const void *,void*, unsigned long long, void*); + + +/* + *Interface for text decryption family + */ +struct pager_crypt_info { + /* Decrypt one page */ + int (*page_decrypt)(const void *src_vaddr, void *dst_vaddr, + unsigned long long src_offset, void *crypt_ops); + /* Pager using this crypter terminates - crypt module not needed anymore */ + void (*crypt_end)(void *crypt_ops); + /* Private data for the crypter */ + void *crypt_ops; +}; +typedef struct pager_crypt_info pager_crypt_info_t; + +typedef int (*text_crypter_create_hook_t)(struct pager_crypt_info *crypt_info, + const char *id, void *crypt_data); 
+extern void text_crypter_create_hook_set(text_crypter_create_hook_t hook); +//extern kern_return_t text_crypter_create(pager_crypt_info_t *crypt_info, const char *id, +// void *crypt_data); +extern text_crypter_create_hook_t text_crypter_create; #endif /* _KERN_PAGE_DECRYPT_H */ diff --git a/osfmk/kern/printf.c b/osfmk/kern/printf.c index a922e3688..1f7015c87 100644 --- a/osfmk/kern/printf.c +++ b/osfmk/kern/printf.c @@ -745,7 +745,8 @@ conslog_putc( cnputc(c); #ifdef MACH_BSD - log_putc(c); + if (debug_mode == 0) + log_putc(c); #endif } diff --git a/osfmk/kern/processor.c b/osfmk/kern/processor.c index 518891c79..0a413bcf1 100644 --- a/osfmk/kern/processor.c +++ b/osfmk/kern/processor.c @@ -886,6 +886,15 @@ processor_set_threads( { return KERN_FAILURE; } +#elif defined(CONFIG_EMBEDDED) +kern_return_t +processor_set_threads( + __unused processor_set_t pset, + __unused thread_array_t *thread_list, + __unused mach_msg_type_number_t *count) +{ + return KERN_NOT_SUPPORTED; +} #else kern_return_t processor_set_threads( diff --git a/osfmk/kern/startup.c b/osfmk/kern/startup.c index 3c132bb74..d0a496a07 100644 --- a/osfmk/kern/startup.c +++ b/osfmk/kern/startup.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -104,6 +104,10 @@ #include #include +#if MACH_KDP +#include +#endif + #if CONFIG_MACF #include #endif @@ -265,6 +269,11 @@ kernel_bootstrap_thread(void) kth_started = 1; +#if MACH_KDP + kernel_bootstrap_kprintf("calling kdp_init\n"); + kdp_init(); +#endif + #ifdef i386 /* * Create and initialize the physical copy window for processor 0 @@ -329,7 +338,7 @@ kernel_bootstrap_thread(void) * Load the first thread to start a processor. 
*/ void -slave_main(void) +slave_main(void *machine_param) { processor_t processor = current_processor(); thread_t thread; @@ -341,7 +350,7 @@ slave_main(void) if (processor->next_thread == THREAD_NULL) { thread = processor->idle_thread; thread->continuation = (thread_continue_t)processor_start_thread; - thread->parameter = NULL; + thread->parameter = machine_param; } else { thread = processor->next_thread; @@ -360,12 +369,12 @@ slave_main(void) * Called at splsched. */ void -processor_start_thread(void) +processor_start_thread(void *machine_param) { processor_t processor = current_processor(); thread_t self = current_thread(); - slave_machine_init(); + slave_machine_init(machine_param); /* * If running the idle processor thread, @@ -406,7 +415,7 @@ load_context( * to have reserved stack. */ load_context_kprintf("stack %x, stackptr %x\n", - thread->kernel_stack, thread->machine.kstackptr); + thread->kernel_stack, thread->machine.kstackptr); if (!thread->kernel_stack) { load_context_kprintf("calling stack_alloc_try\n"); if (!stack_alloc_try(thread)) diff --git a/osfmk/kern/startup.h b/osfmk/kern/startup.h index 475160ce8..bb60c7d40 100644 --- a/osfmk/kern/startup.h +++ b/osfmk/kern/startup.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -47,14 +47,14 @@ extern void kernel_bootstrap(void) __attribute__((section("__TEXT, initcode"))); /* Initialize machine dependent stuff */ extern void machine_init(void); -extern void slave_main(void); +extern void slave_main(void *machine_param); /* * The following must be implemented in machine dependent code. 
*/ /* Slave cpu initialization */ -extern void slave_machine_init(void); +extern void slave_machine_init(void *machine_param); /* Device subystem initialization */ extern void device_service_create(void); diff --git a/osfmk/kern/symbols.c b/osfmk/kern/symbols.c index 0d1b3065b..3196fb4c2 100644 --- a/osfmk/kern/symbols.c +++ b/osfmk/kern/symbols.c @@ -193,7 +193,6 @@ syms_nameforaddr(vm_offset_t addr, vm_offset_t *ofs, kmod_info_t **km) return (NULL); } -int snprintf(char *, size_t, const char *, ...); /* Format the results of calling syms_nameforaddr into a single string. * The buffer must be at least 13 bytes long; 80 is recommended. diff --git a/osfmk/kern/task.h b/osfmk/kern/task.h index e3dae79d9..d0cdc4aaa 100644 --- a/osfmk/kern/task.h +++ b/osfmk/kern/task.h @@ -321,6 +321,8 @@ extern void task_set_64bit( extern void task_backing_store_privileged( task_t task); +extern int get_task_numactivethreads( + task_t task); /* Get number of activations in a task */ extern int get_task_numacts( task_t task); diff --git a/osfmk/kern/zalloc.c b/osfmk/kern/zalloc.c index bdb373604..e189edb2c 100644 --- a/osfmk/kern/zalloc.c +++ b/osfmk/kern/zalloc.c @@ -265,7 +265,6 @@ MACRO_END #define zone_sleep(zone) \ (void) lck_mtx_sleep(&(zone)->lock, 0, (event_t)(zone), THREAD_UNINT); -extern int snprintf(char *, size_t, const char *, ...) 
__printflike(3,4); #define lock_zone_init(zone) \ MACRO_BEGIN \ @@ -615,7 +614,7 @@ zone_bootstrap(void) char temp_buf[16]; /* see if we want freed zone element checking */ - if (PE_parse_boot_arg("-zc", temp_buf)) { + if (PE_parse_boot_argn("-zc", temp_buf, sizeof (temp_buf))) { check_freed_element = 1; } diff --git a/osfmk/mach/host_special_ports.h b/osfmk/mach/host_special_ports.h index 00e9580ea..9aa0a75c7 100644 --- a/osfmk/mach/host_special_ports.h +++ b/osfmk/mach/host_special_ports.h @@ -85,7 +85,10 @@ #define HOST_USER_NOTIFICATION_PORT (3 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_LOCKD_PORT (5 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_SEATBELT_PORT (7 + HOST_MAX_SPECIAL_KERNEL_PORT) -#define HOST_MAX_SPECIAL_PORT (8 + HOST_MAX_SPECIAL_KERNEL_PORT) + +#define HOST_UNFREED_PORT (10 + HOST_MAX_SPECIAL_KERNEL_PORT) +#define HOST_AMFID_PORT (11 + HOST_MAX_SPECIAL_KERNEL_PORT) +#define HOST_MAX_SPECIAL_PORT (12 + HOST_MAX_SPECIAL_KERNEL_PORT) /* room to grow here as well */ /* @@ -142,4 +145,16 @@ #define host_set_lockd_port(host, port) \ (host_set_special_port((host), HOST_LOCKD_PORT, (port))) +#define host_get_unfreed_port(host, port) \ + (host_get_special_port((host), \ + HOST_LOCAL_NODE, HOST_UNFREED_PORT, (port))) +#define host_set_unfreed_port(host, port) \ + (host_set_special_port((host), HOST_UNFREED_PORT, (port))) + +#define host_get_amfid_port(host, port) \ + (host_get_special_port((host), \ + HOST_LOCAL_NODE, HOST_AMFID_PORT, (port))) +#define host_set_amfid_port(host, port) \ + (host_set_special_port((host), HOST_AMFID_PORT, (port))) + #endif /* _MACH_HOST_SPECIAL_PORTS_H_ */ diff --git a/osfmk/mach/memory_object.defs b/osfmk/mach/memory_object.defs index 9fd7664f4..9bc85153a 100644 --- a/osfmk/mach/memory_object.defs +++ b/osfmk/mach/memory_object.defs @@ -197,7 +197,10 @@ routine memory_object_synchronize( * [Response should be a release of the named reference when * the pager deems that appropriate.] 
*/ -routine memory_object_unmap( +routine memory_object_map( + memory_object : memory_object_t; + prot : vm_prot_t); +routine memory_object_last_unmap( memory_object : memory_object_t); /* vim: set ft=c : */ diff --git a/osfmk/mach/memory_object_types.h b/osfmk/mach/memory_object_types.h index 01b462b12..739f0374d 100644 --- a/osfmk/mach/memory_object_types.h +++ b/osfmk/mach/memory_object_types.h @@ -144,7 +144,10 @@ typedef const struct memory_object_pager_ops { memory_object_offset_t offset, vm_size_t size, vm_sync_t sync_flags); - kern_return_t (*memory_object_unmap)( + kern_return_t (*memory_object_map)( + memory_object_t mem_obj, + vm_prot_t prot); + kern_return_t (*memory_object_last_unmap)( memory_object_t mem_obj); const char *memory_object_pager_name; } * memory_object_pager_ops_t; @@ -386,15 +389,17 @@ struct upl_page_info { ppnum_t phys_addr; /* physical page index number */ unsigned int #ifdef XNU_KERNEL_PRIVATE - pageout:1, /* page is to be removed on commit */ - absent:1, /* No valid data in this page */ - dirty:1, /* Page must be cleaned (O) */ - precious:1, /* must be cleaned, we have only copy */ - device:1, /* no page data, mapped dev memory */ - speculative:1, /* page is valid, but not yet accessed */ - :0; /* force to long boundary */ + pageout:1, /* page is to be removed on commit */ + absent:1, /* No valid data in this page */ + dirty:1, /* Page must be cleaned (O) */ + precious:1, /* must be cleaned, we have only copy */ + device:1, /* no page data, mapped dev memory */ + speculative:1, /* page is valid, but not yet accessed */ + cs_validated:1, /* CODE SIGNING: page was validated */ + cs_tainted:1, /* CODE SIGNING: page is tainted */ + :0; /* force to long boundary */ #else - opaque; /* use upl_page_xxx() accessor funcs */ + opaque; /* use upl_page_xxx() accessor funcs */ #endif /* XNU_KERNEL_PRIVATE */ }; @@ -532,6 +537,9 @@ typedef uint32_t upl_size_t; /* page-aligned byte size */ #define UPL_COMMIT_INACTIVATE 0x8 #define 
UPL_COMMIT_NOTIFY_EMPTY 0x10 #define UPL_COMMIT_ALLOW_ACCESS 0x20 +#define UPL_COMMIT_CS_VALIDATED 0x40 + +#define UPL_COMMIT_KERNEL_ONLY_FLAGS (UPL_COMMIT_CS_VALIDATED) /* flags for return of state from vm_map_get_upl, vm_upl address space */ /* based call */ @@ -610,6 +618,14 @@ typedef uint32_t upl_size_t; /* page-aligned byte size */ (((upl)[(index)].phys_addr != 0) ? \ ((upl)[(index)].pageout = FALSE) : FALSE) +/* modifier macros for upl_t */ + +#define UPL_SET_CS_VALIDATED(upl, index, value) \ + ((upl)[(index)].cs_validated = ((value) ? TRUE : FALSE)) + +#define UPL_SET_CS_TAINTED(upl, index, value) \ + ((upl)[(index)].cs_tainted = ((value) ? TRUE : FALSE)) + /* The call prototyped below is used strictly by UPL_GET_INTERNAL_PAGE_LIST */ extern vm_size_t upl_offset_to_pagelist; diff --git a/osfmk/mach/vm_statistics.h b/osfmk/mach/vm_statistics.h index aab39fd3c..f0bdd1a47 100644 --- a/osfmk/mach/vm_statistics.h +++ b/osfmk/mach/vm_statistics.h @@ -109,6 +109,8 @@ typedef struct vm_statistics vm_statistics_data_t; #define VM_PAGE_QUERY_PAGE_PAGED_OUT 0x10 #define VM_PAGE_QUERY_PAGE_COPIED 0x20 #define VM_PAGE_QUERY_PAGE_SPECULATIVE 0x40 +#define VM_PAGE_QUERY_PAGE_CS_VALIDATED 0x100 +#define VM_PAGE_QUERY_PAGE_CS_TAINTED 0x200 #ifdef MACH_KERNEL_PRIVATE diff --git a/osfmk/ppc/machine_routines.c b/osfmk/ppc/machine_routines.c index 7f127ee81..9386f8597 100644 --- a/osfmk/ppc/machine_routines.c +++ b/osfmk/ppc/machine_routines.c @@ -754,7 +754,7 @@ ml_init_lock_timeout(void) nanoseconds_to_absolutetime(NSEC_PER_SEC>>2, &abstime); LockTimeOut = (unsigned int)abstime; - if (PE_parse_boot_arg("mtxspin", &mtxspin)) { + if (PE_parse_boot_argn("mtxspin", &mtxspin, sizeof (mtxspin))) { if (mtxspin > USEC_PER_SEC>>4) mtxspin = USEC_PER_SEC>>4; nanoseconds_to_absolutetime(mtxspin*NSEC_PER_USEC, &abstime); diff --git a/osfmk/ppc/model_dep.c b/osfmk/ppc/model_dep.c index 97f5276f2..a6dcb6577 100644 --- a/osfmk/ppc/model_dep.c +++ b/osfmk/ppc/model_dep.c @@ -1,5 +1,5 @@ /* - 
* Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -229,7 +229,7 @@ machine_startup(void) int boot_arg; unsigned int wncpu; - if (PE_parse_boot_arg("cpus", &wncpu)) { + if (PE_parse_boot_argn("cpus", &wncpu, sizeof (wncpu))) { if ((wncpu > 0) && (wncpu < MAX_CPUS)) max_ncpus = wncpu; } @@ -237,7 +237,7 @@ machine_startup(void) if( PE_get_hotkey( kPEControlKey )) halt_in_debugger = halt_in_debugger ? 0 : 1; - if (PE_parse_boot_arg("debug", &boot_arg)) { + if (PE_parse_boot_argn("debug", &boot_arg, sizeof (boot_arg))) { if (boot_arg & DB_HALT) halt_in_debugger=1; if (boot_arg & DB_PRT) disable_debug_output=FALSE; if (boot_arg & DB_SLOG) systemLogDiags=TRUE; @@ -245,10 +245,10 @@ machine_startup(void) if (boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE; } - if (!PE_parse_boot_arg("nvram_paniclog", &commit_paniclog_to_nvram)) + if (!PE_parse_boot_argn("nvram_paniclog", &commit_paniclog_to_nvram, sizeof (commit_paniclog_to_nvram))) commit_paniclog_to_nvram = 1; - PE_parse_boot_arg("vmmforce", &lowGlo.lgVMMforcedFeats); + PE_parse_boot_argn("vmmforce", &lowGlo.lgVMMforcedFeats, sizeof (lowGlo.lgVMMforcedFeats)); hw_lock_init(&debugger_lock); /* initialize debugger lock */ hw_lock_init(&pbtlock); /* initialize print backtrace lock */ @@ -276,16 +276,16 @@ machine_startup(void) active_debugger =1; } #endif /* MACH_KDB */ - if (PE_parse_boot_arg("preempt", &boot_arg)) { + if (PE_parse_boot_argn("preempt", &boot_arg, sizeof (boot_arg))) { default_preemption_rate = boot_arg; } - if (PE_parse_boot_arg("unsafe", &boot_arg)) { + if (PE_parse_boot_argn("unsafe", &boot_arg, sizeof (boot_arg))) { max_unsafe_quanta = boot_arg; } - if (PE_parse_boot_arg("poll", &boot_arg)) { + if (PE_parse_boot_argn("poll", &boot_arg, sizeof (boot_arg))) { max_poll_quanta = boot_arg; } - if (PE_parse_boot_arg("yield", &boot_arg)) { + if (PE_parse_boot_argn("yield", &boot_arg, 
sizeof (boot_arg))) { sched_poll_yield_shift = boot_arg; } @@ -322,7 +322,8 @@ machine_init(void) } -void slave_machine_init(void) +void +slave_machine_init(__unused void *param) { cpu_machine_init(); /* Initialize the processor */ clock_init(); /* Init the clock */ diff --git a/osfmk/ppc/movc.s b/osfmk/ppc/movc.s index 1e111ec0c..2e100071b 100644 --- a/osfmk/ppc/movc.s +++ b/osfmk/ppc/movc.s @@ -629,7 +629,7 @@ copyJoin1: // enter from copyinstr with kkNull set crmove kk64bit,pf64Bitb // remember if this is a 64-bit processor stw r7,kkCountPtr(r1) stw r31,kkR31Save(r1) // we use r31 globally for mapped user ptr - li r31,0 // no mapped ptr yet + // Handle buffer length > 256MB. This is an error (ENAMETOOLONG) on copyin and copyout. @@ -648,6 +648,7 @@ copyJoin1: // enter from copyinstr with kkNull set // Set up thread_recover in case we hit an illegal address. copyin0: + li r31,0 // no mapped ptr yet mfsprg r8,1 // Get the current thread lis r2,hi16(copyinout_error) ori r2,r2,lo16(copyinout_error) diff --git a/osfmk/ppc/ppc_init.c b/osfmk/ppc/ppc_init.c index c5629bf1b..35526ab2c 100644 --- a/osfmk/ppc/ppc_init.c +++ b/osfmk/ppc/ppc_init.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -201,28 +201,28 @@ ppc_init( PE_init_platform(FALSE, args); /* Get platform expert set up */ - if (!PE_parse_boot_arg("novmx", &novmx)) novmx=0; /* Special run without VMX? */ + if (!PE_parse_boot_argn("novmx", &novmx, sizeof (novmx))) novmx=0; /* Special run without VMX? 
*/ if(novmx) { /* Yeah, turn it off */ BootProcInfo.pf.Available &= ~pfAltivec; /* Turn off Altivec available */ __asm__ volatile("mtsprg 2,%0" : : "r" (BootProcInfo.pf.Available)); /* Set live value */ } - if (!PE_parse_boot_arg("fn", &forcenap)) forcenap = 0; /* If force nap not set, make 0 */ + if (!PE_parse_boot_argn("fn", &forcenap, sizeof (forcenap))) forcenap = 0; /* If force nap not set, make 0 */ else { if(forcenap < 2) forcenap = forcenap + 1; /* Else set 1 for off, 2 for on */ else forcenap = 0; /* Clear for error case */ } - if (!PE_parse_boot_arg("pmsx", &pmsExperimental)) pmsExperimental = 0; /* Check if we should start in experimental power management stepper mode */ - if (!PE_parse_boot_arg("lcks", &LcksOpts)) LcksOpts = 0; /* Set lcks options */ - if (!PE_parse_boot_arg("diag", &dgWork.dgFlags)) dgWork.dgFlags = 0; /* Set diagnostic flags */ + if (!PE_parse_boot_argn("pmsx", &pmsExperimental, sizeof (pmsExperimental))) pmsExperimental = 0; /* Check if we should start in experimental power management stepper mode */ + if (!PE_parse_boot_argn("lcks", &LcksOpts, sizeof (LcksOpts))) LcksOpts = 0; /* Set lcks options */ + if (!PE_parse_boot_argn("diag", &dgWork.dgFlags, sizeof (dgWork.dgFlags))) dgWork.dgFlags = 0; /* Set diagnostic flags */ if(dgWork.dgFlags & enaExpTrace) trcWork.traceMask = 0xFFFFFFFF; /* If tracing requested, enable it */ - if(PE_parse_boot_arg("ctrc", &cputrace)) { /* See if tracing is limited to a specific cpu */ + if(PE_parse_boot_argn("ctrc", &cputrace, sizeof (cputrace))) { /* See if tracing is limited to a specific cpu */ trcWork.traceMask = (trcWork.traceMask & 0xFFFFFFF0) | (cputrace & 0xF); /* Limit to 4 */ } - if(!PE_parse_boot_arg("tb", &trcWork.traceSize)) { /* See if non-default trace buffer size */ + if(!PE_parse_boot_argn("tb", &trcWork.traceSize, sizeof (trcWork.traceSize))) { /* See if non-default trace buffer size */ #if DEBUG trcWork.traceSize = 32; /* Default 32 page trace table for DEBUG */ #else @@ -234,18 
+234,18 @@ ppc_init( if(trcWork.traceSize > 256) trcWork.traceSize = 256; /* Maximum size of 256 pages */ trcWork.traceSize = trcWork.traceSize * 4096; /* Change page count to size */ - if (!PE_parse_boot_arg("maxmem", &maxmem)) + if (!PE_parse_boot_argn("maxmem", &maxmem, sizeof (maxmem))) xmaxmem=0; else xmaxmem = (uint64_t)maxmem * (1024 * 1024); - if (!PE_parse_boot_arg("wcte", &wcte)) wcte = 0; /* If write combine timer enable not supplied, make 1 */ + if (!PE_parse_boot_argn("wcte", &wcte, sizeof (wcte))) wcte = 0; /* If write combine timer enable not supplied, make 1 */ else wcte = (wcte != 0); /* Force to 0 or 1 */ - if (!PE_parse_boot_arg("mcklog", &mckFlags)) mckFlags = 0; /* If machine check flags not specified, clear */ + if (!PE_parse_boot_argn("mcklog", &mckFlags, sizeof (mckFlags))) mckFlags = 0; /* If machine check flags not specified, clear */ else if(mckFlags > 1) mckFlags = 0; /* If bogus, clear */ - if (!PE_parse_boot_arg("ht_shift", &hash_table_shift)) /* should we use a non-default hash table size? */ + if (!PE_parse_boot_argn("ht_shift", &hash_table_shift, sizeof (hash_table_shift))) /* should we use a non-default hash table size? */ hash_table_shift = 0; /* no, use default size */ /* @@ -262,7 +262,7 @@ ppc_init( (void)ml_scom_write(GUSModeReg << 8, scdata); /* Get GUS mode register */ } - if(PE_parse_boot_arg("mcksoft", &mcksoft)) { /* Have they supplied "machine check software recovery? */ + if(PE_parse_boot_argn("mcksoft", &mcksoft, sizeof (mcksoft))) { /* Have they supplied "machine check software recovery? 
*/ newhid = BootProcInfo.pf.pfHID5; /* Get the old HID5 */ if(mcksoft < 2) { newhid &= 0xFFFFFFFFFFFFDFFFULL; /* Clear the old one */ @@ -296,5 +296,5 @@ ppc_init_cpu( cpu_init(); - slave_main(); + slave_main(NULL); } diff --git a/osfmk/ppc/ppc_vm_init.c b/osfmk/ppc/ppc_vm_init.c index 49d177d80..0ff41cf4d 100644 --- a/osfmk/ppc/ppc_vm_init.c +++ b/osfmk/ppc/ppc_vm_init.c @@ -354,7 +354,7 @@ void ppc_vm_init(uint64_t mem_limit, boot_args *args) kprintf("kprintf initialized\n"); serialmode = 0; /* Assume normal keyboard and console */ - if(PE_parse_boot_arg("serial", &serialmode)) { /* Do we want a serial keyboard and/or console? */ + if(PE_parse_boot_argn("serial", &serialmode, sizeof (serialmode))) { /* Do we want a serial keyboard and/or console? */ kprintf("Serial mode specified: %08X\n", serialmode); } if(serialmode & 1) { /* Start serial if requested */ diff --git a/osfmk/vm/bsd_vm.c b/osfmk/vm/bsd_vm.c index e9af2b6ee..fd383fea3 100644 --- a/osfmk/vm/bsd_vm.c +++ b/osfmk/vm/bsd_vm.c @@ -126,7 +126,8 @@ const struct memory_object_pager_ops vnode_pager_ops = { vnode_pager_data_initialize, vnode_pager_data_unlock, vnode_pager_synchronize, - vnode_pager_unmap, + vnode_pager_map, + vnode_pager_last_unmap, "vnode pager" }; @@ -494,9 +495,9 @@ vnode_pager_bootstrap(void) size = (vm_size_t) sizeof(struct vnode_pager); vnode_pager_zone = zinit(size, (vm_size_t) MAX_VNODE*size, PAGE_SIZE, "vnode pager structures"); -#ifdef __i386__ +#if CONFIG_CODE_DECRYPTION apple_protect_pager_bootstrap(); -#endif /* __i386__ */ +#endif /* CONFIG_CODE_DECRYPTION */ return; } @@ -782,12 +783,36 @@ vnode_pager_synchronize( * */ kern_return_t -vnode_pager_unmap( +vnode_pager_map( + memory_object_t mem_obj, + vm_prot_t prot) +{ + vnode_pager_t vnode_object; + int ret; + kern_return_t kr; + + PAGER_DEBUG(PAGER_ALL, ("vnode_pager_map: %p %x\n", mem_obj, prot)); + + vnode_object = vnode_pager_lookup(mem_obj); + + ret = ubc_map(vnode_object->vnode_handle, prot); + + if (ret != 0) { + kr = 
KERN_FAILURE; + } else { + kr = KERN_SUCCESS; + } + + return kr; +} + +kern_return_t +vnode_pager_last_unmap( memory_object_t mem_obj) { register vnode_pager_t vnode_object; - PAGER_DEBUG(PAGER_ALL, ("vnode_pager_unmap: %p\n", mem_obj)); + PAGER_DEBUG(PAGER_ALL, ("vnode_pager_last_unmap: %p\n", mem_obj)); vnode_object = vnode_pager_lookup(mem_obj); diff --git a/osfmk/vm/device_vm.c b/osfmk/vm/device_vm.c index 015200a6a..4f32ac723 100644 --- a/osfmk/vm/device_vm.c +++ b/osfmk/vm/device_vm.c @@ -73,7 +73,8 @@ const struct memory_object_pager_ops device_pager_ops = { device_pager_data_initialize, device_pager_data_unlock, device_pager_synchronize, - device_pager_unmap, + device_pager_map, + device_pager_last_unmap, "device pager" }; @@ -424,7 +425,15 @@ device_pager_synchronize( * */ kern_return_t -device_pager_unmap( +device_pager_map( + __unused memory_object_t mem_obj, + __unused vm_prot_t prot) +{ + return KERN_SUCCESS; +} + +kern_return_t +device_pager_last_unmap( __unused memory_object_t mem_obj) { return KERN_SUCCESS; diff --git a/osfmk/vm/memory_object.c b/osfmk/vm/memory_object.c index c84e776c6..f83dd5e88 100644 --- a/osfmk/vm/memory_object.c +++ b/osfmk/vm/memory_object.c @@ -2147,13 +2147,44 @@ kern_return_t memory_object_synchronize sync_flags); } -/* Routine memory_object_unmap */ -kern_return_t memory_object_unmap + +/* + * memory_object_map() is called by VM (in vm_map_enter() and its variants) + * each time a "named" VM object gets mapped directly or indirectly + * (copy-on-write mapping). A "named" VM object has an extra reference held + * by the pager to keep it alive until the pager decides that the + * memory object (and its VM object) can be reclaimed. + * VM calls memory_object_last_unmap() (in vm_object_deallocate()) when all + * the mappings of that memory object have been removed. 
+ * + * For a given VM object, calls to memory_object_map() and memory_object_unmap() + * are serialized (through object->mapping_in_progress), to ensure that the + * pager gets a consistent view of the mapping status of the memory object. + * + * This allows the pager to keep track of how many times a memory object + * has been mapped and with which protections, to decide when it can be + * reclaimed. + */ + +/* Routine memory_object_map */ +kern_return_t memory_object_map +( + memory_object_t memory_object, + vm_prot_t prot +) +{ + return (memory_object->mo_pager_ops->memory_object_map)( + memory_object, + prot); +} + +/* Routine memory_object_last_unmap */ +kern_return_t memory_object_last_unmap ( memory_object_t memory_object ) { - return (memory_object->mo_pager_ops->memory_object_unmap)( + return (memory_object->mo_pager_ops->memory_object_last_unmap)( memory_object); } diff --git a/osfmk/vm/vm_apple_protect.c b/osfmk/vm/vm_apple_protect.c index 5045c1b15..54e618e40 100644 --- a/osfmk/vm/vm_apple_protect.c +++ b/osfmk/vm/vm_apple_protect.c @@ -110,7 +110,9 @@ kern_return_t apple_protect_pager_synchronize(memory_object_t mem_obj, memory_object_offset_t offset, vm_size_t length, vm_sync_t sync_flags); -kern_return_t apple_protect_pager_unmap(memory_object_t mem_obj); +kern_return_t apple_protect_pager_map(memory_object_t mem_obj, + vm_prot_t prot); +kern_return_t apple_protect_pager_last_unmap(memory_object_t mem_obj); /* * Vector of VM operations for this EMM. @@ -126,7 +128,8 @@ const struct memory_object_pager_ops apple_protect_pager_ops = { apple_protect_pager_data_initialize, apple_protect_pager_data_unlock, apple_protect_pager_synchronize, - apple_protect_pager_unmap, + apple_protect_pager_map, + apple_protect_pager_last_unmap, "apple protect pager" }; @@ -143,6 +146,7 @@ typedef struct apple_protect_pager { boolean_t is_mapped; /* is this mem_obj mapped ? 
*/ memory_object_control_t pager_control; /* mem object control handle */ vm_object_t backing_object; /* VM obj w/ encrypted data */ + struct pager_crypt_info crypt; } *apple_protect_pager_t; #define APPLE_PROTECT_PAGER_NULL ((apple_protect_pager_t) NULL) @@ -169,7 +173,8 @@ int apple_protect_pager_num_trim_max = 0; int apple_protect_pager_num_trim_total = 0; /* internal prototypes */ -apple_protect_pager_t apple_protect_pager_create(vm_object_t backing_object); +apple_protect_pager_t apple_protect_pager_create(vm_object_t backing_object, + struct pager_crypt_info *crypt_info); apple_protect_pager_t apple_protect_pager_lookup(memory_object_t mem_obj); void apple_protect_pager_dequeue(apple_protect_pager_t pager); void apple_protect_pager_deallocate_internal(apple_protect_pager_t pager, @@ -315,7 +320,8 @@ apple_protect_pager_data_request( upl_t upl; int upl_flags; upl_size_t upl_size; - upl_page_info_t *upl_pl; + upl_page_info_t *upl_pl = NULL; + unsigned int pl_count; vm_object_t src_object, dst_object; kern_return_t kr, retval; vm_map_offset_t kernel_mapping; @@ -333,6 +339,7 @@ apple_protect_pager_data_request( src_object = VM_OBJECT_NULL; kernel_mapping = 0; upl = NULL; + upl_pl = NULL; fault_info = (vm_object_fault_info_t) mo_fault_info; interruptible = fault_info->interruptible; @@ -354,6 +361,7 @@ apple_protect_pager_data_request( UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | /* triggers UPL_CLEAR_DIRTY */ UPL_SET_INTERNAL; + pl_count = 0; kr = memory_object_upl_request(mo_control, offset, upl_size, &upl, NULL, NULL, upl_flags); @@ -401,6 +409,7 @@ apple_protect_pager_data_request( * Fill in the contents of the pages requested by VM. */ upl_pl = UPL_GET_INTERNAL_PAGE_LIST(upl); + pl_count = length / PAGE_SIZE; for (cur_offset = 0; cur_offset < length; cur_offset += PAGE_SIZE) { ppnum_t dst_pnum; @@ -485,13 +494,29 @@ apple_protect_pager_data_request( dst_object->wimg_bits & VM_WIMG_MASK, TRUE); + /* + * Validate the original page... 
+ */ + if (src_page->object->code_signed) { + vm_page_validate_cs_mapped(src_page, + (const void *) src_vaddr); + } + /* + * ... and transfer the results to the destination page. + */ + UPL_SET_CS_VALIDATED(upl_pl, cur_offset / PAGE_SIZE, + src_page->cs_validated); + UPL_SET_CS_TAINTED(upl_pl, cur_offset / PAGE_SIZE, + src_page->cs_tainted); + /* * Decrypt the encrypted contents of the source page * into the destination page. */ - dsmos_page_transform((const void *) src_vaddr, - (void *) dst_vaddr); - + pager->crypt.page_decrypt((const void *) src_vaddr, + (void *) dst_vaddr, offset+cur_offset, + pager->crypt.crypt_ops); + /* * Remove the pmap mapping of the source and destination pages * in the kernel. @@ -535,7 +560,10 @@ apple_protect_pager_data_request( if (retval != KERN_SUCCESS) { upl_abort(upl, 0); } else { - upl_commit(upl, NULL, 0); + boolean_t empty; + upl_commit_range(upl, 0, upl->size, + UPL_COMMIT_CS_VALIDATED, + upl_pl, pl_count, &empty); } /* and deallocate the UPL */ @@ -632,6 +660,10 @@ apple_protect_pager_terminate_internal( /* trigger the destruction of the memory object */ memory_object_destroy(pager->pager_control, 0); + + /* deallocate any crypt module data */ + if(pager->crypt.crypt_end) + pager->crypt.crypt_end(pager->crypt.crypt_ops); } /* @@ -762,9 +794,10 @@ apple_protect_pager_synchronize( * time the memory object gets mapped and we take one extra reference on the * memory object to account for all its mappings. */ -void +kern_return_t apple_protect_pager_map( - memory_object_t mem_obj) + memory_object_t mem_obj, + __unused vm_prot_t prot) { apple_protect_pager_t pager; @@ -786,21 +819,24 @@ apple_protect_pager_map( apple_protect_pager_count_mapped++; } mutex_unlock(&apple_protect_pager_lock); + + return KERN_SUCCESS; } /* - * apple_protect_pager_unmap() + * apple_protect_pager_last_unmap() * * This is called by VM when this memory object is no longer mapped anywhere. 
*/ kern_return_t -apple_protect_pager_unmap( +apple_protect_pager_last_unmap( memory_object_t mem_obj) { apple_protect_pager_t pager; int count_unmapped; - PAGER_DEBUG(PAGER_ALL, ("apple_protect_pager_unmap: %p\n", mem_obj)); + PAGER_DEBUG(PAGER_ALL, + ("apple_protect_pager_last_unmap: %p\n", mem_obj)); pager = apple_protect_pager_lookup(mem_obj); @@ -844,7 +880,8 @@ apple_protect_pager_lookup( apple_protect_pager_t apple_protect_pager_create( - vm_object_t backing_object) + vm_object_t backing_object, + struct pager_crypt_info *crypt_info) { apple_protect_pager_t pager, pager2; memory_object_control_t control; @@ -869,6 +906,8 @@ apple_protect_pager_create( pager->is_mapped = FALSE; pager->pager_control = MEMORY_OBJECT_CONTROL_NULL; pager->backing_object = backing_object; + pager->crypt = *crypt_info; + vm_object_reference(backing_object); mutex_lock(&apple_protect_pager_lock); @@ -932,7 +971,8 @@ apple_protect_pager_create( */ memory_object_t apple_protect_pager_setup( - vm_object_t backing_object) + vm_object_t backing_object, + struct pager_crypt_info *crypt_info) { apple_protect_pager_t pager; @@ -943,6 +983,12 @@ apple_protect_pager_setup( apple_protect_pager_t, pager_queue) { if (pager->backing_object == backing_object) { + /* For the same object we must always use the same protection options */ + if (!((pager->crypt.page_decrypt == crypt_info->page_decrypt) && + (pager->crypt.crypt_ops == crypt_info->crypt_ops) )) { + mutex_unlock(&apple_protect_pager_lock); + return MEMORY_OBJECT_NULL; + } break; } } @@ -958,7 +1004,7 @@ apple_protect_pager_setup( mutex_unlock(&apple_protect_pager_lock); if (pager == APPLE_PROTECT_PAGER_NULL) { - pager = apple_protect_pager_create(backing_object); + pager = apple_protect_pager_create(backing_object, crypt_info); if (pager == APPLE_PROTECT_PAGER_NULL) { return MEMORY_OBJECT_NULL; } diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index f5275261d..77a34c912 100644 --- a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ 
-155,6 +155,14 @@ unsigned long vm_cs_revalidates = 0; unsigned long vm_cs_query_modified = 0; unsigned long vm_cs_validated_dirtied = 0; +#if CONFIG_ENFORCE_SIGNED_CODE +#if SECURE_KERNEL +const int cs_enforcement_disable=0; +#else +int cs_enforcement_disable=1; +#endif +#endif + /* * Routine: vm_fault_init * Purpose: @@ -163,6 +171,12 @@ unsigned long vm_cs_validated_dirtied = 0; void vm_fault_init(void) { +#if !SECURE_KERNEL +#if CONFIG_ENFORCE_SIGNED_CODE + PE_parse_boot_argn("cs_enforcement_disable", &cs_enforcement_disable, sizeof (cs_enforcement_disable)); +#endif + PE_parse_boot_argn("cs_debug", &cs_debug, sizeof (cs_debug)); +#endif } /* @@ -1958,6 +1972,21 @@ vm_fault_page( +/* + * CODE SIGNING: + * When soft faulting a page, we have to validate the page if: + * 1. the page is being mapped in user space + * 2. the page hasn't already been found to be "tainted" + * 3. the page belongs to a code-signed object + * 4. the page has not been validated yet or has been mapped for write. + */ +#define VM_FAULT_NEED_CS_VALIDATION(pmap, page) \ + ((pmap) != kernel_pmap /*1*/ && \ + !(page)->cs_tainted /*2*/ && \ + (page)->object->code_signed /*3*/ && \ + (!(page)->cs_validated || (page)->wpmapped /*4*/)) + + /* * page queue lock must NOT be held * m->object must be locked @@ -1995,24 +2024,6 @@ vm_fault_enter(vm_page_t m, cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK; - if (m->object->code_signed && pmap != kernel_pmap && - (!m->cs_validated || m->wpmapped)) { - vm_object_lock_assert_exclusive(m->object); - - if (m->cs_validated && m->wpmapped) { - vm_cs_revalidates++; - } - - /* - * CODE SIGNING: - * This page comes from a VM object backed by a signed - * memory object. We are about to enter it into a process - * address space, so we need to validate its signature. 
- */ - /* VM map is locked, so 1 ref will remain on VM object */ - vm_page_validate_cs(m); - } - if (m->pmapped == FALSE) { /* * This is the first time this page is being @@ -2058,7 +2069,26 @@ vm_fault_enter(vm_page_t m, } } - if (m->cs_tainted) { + if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) { + vm_object_lock_assert_exclusive(m->object); + + if (m->cs_validated) { + vm_cs_revalidates++; + } + + /* VM map is locked, so 1 ref will remain on VM object */ + vm_page_validate_cs(m); + } + + if (m->cs_tainted /* always invalidate a tainted page */ +#if CONFIG_ENFORCE_SIGNED_CODE + /* + * Code Signing enforcement invalidates an executable page that + * has no code directory, and thus could not be validated. + */ + || ((prot & VM_PROT_EXECUTE) && !m->cs_validated ) +#endif + ) { /* * CODE SIGNING: * This page has been tainted and can not be trusted. @@ -2066,18 +2096,25 @@ vm_fault_enter(vm_page_t m, * necessary precautions before we enter the tainted page * into its address space. */ - if (cs_invalid_page()) { - /* reject the tainted page: abort the page fault */ - kr = KERN_MEMORY_ERROR; - cs_enter_tainted_rejected++; - } else { - /* proceed with the tainted page */ - kr = KERN_SUCCESS; - cs_enter_tainted_accepted++; + kr = KERN_SUCCESS; +#if CONFIG_ENFORCE_SIGNED_CODE + if (!cs_enforcement_disable) { +#endif + if (cs_invalid_page((addr64_t) vaddr)) { + /* reject the tainted page: abort the page fault */ + kr = KERN_MEMORY_ERROR; + cs_enter_tainted_rejected++; + } else { + /* proceed with the tainted page */ + kr = KERN_SUCCESS; + cs_enter_tainted_accepted++; + } +#if CONFIG_ENFORCE_SIGNED_CODE } +#endif if (cs_debug || kr != KERN_SUCCESS) { printf("CODESIGNING: vm_fault_enter(0x%llx): " - "page %p obj %p off 0x%llx *** TAINTED ***\n", + "page %p obj %p off 0x%llx *** INVALID PAGE ***\n", (long long)vaddr, m, m->object, m->offset); } } else { @@ -2092,7 +2129,7 @@ vm_fault_enter(vm_page_t m, * since this is the ONLY bit updated behind the SHARED * lock... 
however, we need to figure out how to do an atomic * update on a bit field to make this less fragile... right - * now I don'w know how to coerce 'C' to give me the offset info + * now I don't know how to coerce 'C' to give me the offset info * that's needed for an AtomicCompareAndSwap */ m->pmapped = TRUE; @@ -2512,8 +2549,7 @@ vm_fault( } ASSERT_PAGE_DECRYPTED(m); - if (m->object->code_signed && map != kernel_map && - (!m->cs_validated || m->wpmapped)) { + if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) { /* * We might need to validate this page * against its code signature, so we @@ -3431,11 +3467,11 @@ vm_fault_unwire( for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { - if (pmap) { - pmap_change_wiring(pmap, - pmap_addr + (va - entry->vme_start), FALSE); - } if (object == VM_OBJECT_NULL) { + if (pmap) { + pmap_change_wiring(pmap, + pmap_addr + (va - entry->vme_start), FALSE); + } (void) vm_fault(map, va, VM_PROT_NONE, TRUE, THREAD_UNINT, pmap, pmap_addr); } else { @@ -3483,6 +3519,10 @@ vm_fault_unwire( result_object = result_page->object; + if ((pmap) && (result_page->phys_page != vm_page_guard_addr)) { + pmap_change_wiring(pmap, + pmap_addr + (va - entry->vme_start), FALSE); + } if (deallocate) { assert(result_page->phys_page != vm_page_fictitious_addr); @@ -4130,6 +4170,89 @@ vm_fault_classify_init(void) extern int cs_validation; +void +vm_page_validate_cs_mapped( + vm_page_t page, + const void *kaddr) +{ + vm_object_t object; + vm_object_offset_t offset; + kern_return_t kr; + memory_object_t pager; + void *blobs; + boolean_t validated, tainted; + + assert(page->busy); + vm_object_lock_assert_exclusive(page->object); + + if (!cs_validation) { + return; + } + + if (page->wpmapped && !page->cs_tainted) { + /* + * This page was mapped for "write" access sometime in the + * past and could still be modifiable in the future. + * Consider it tainted. + * [ If the page was already found to be "tainted", no + * need to re-validate. 
] + */ + page->cs_validated = TRUE; + page->cs_tainted = TRUE; + if (cs_debug) { + printf("CODESIGNING: vm_page_validate_cs: " + "page %p obj %p off 0x%llx " + "was modified\n", + page, page->object, page->offset); + } + vm_cs_validated_dirtied++; + } + + if (page->cs_validated) { + return; + } + + vm_cs_validates++; + + object = page->object; + assert(object->code_signed); + offset = page->offset; + + if (!object->alive || object->terminating || object->pager == NULL) { + /* + * The object is terminating and we don't have its pager + * so we can't validate the data... + */ + return; + } + /* + * Since we get here to validate a page that was brought in by + * the pager, we know that this pager is all setup and ready + * by now. + */ + assert(!object->internal); + assert(object->pager != NULL); + assert(object->pager_ready); + + pager = object->pager; + + kr = vnode_pager_get_object_cs_blobs(pager, &blobs); + if (kr != KERN_SUCCESS) { + blobs = NULL; + } + + /* verify the SHA1 hash for this page */ + validated = cs_validate_page(blobs, + offset + object->paging_offset, + (const void *)kaddr, + &tainted); + + page->cs_validated = validated; + if (validated) { + page->cs_tainted = tainted; + } +} + void vm_page_validate_cs( vm_page_t page) @@ -4140,9 +4263,6 @@ vm_page_validate_cs( vm_map_size_t ksize; vm_offset_t kaddr; kern_return_t kr; - memory_object_t pager; - void *blobs; - boolean_t validated, tainted; boolean_t busy_page; vm_object_lock_assert_held(page->object); @@ -4151,35 +4271,25 @@ vm_page_validate_cs( return; } - if (page->cs_validated && !page->cs_tainted && page->wpmapped) { + if (page->wpmapped && !page->cs_tainted) { vm_object_lock_assert_exclusive(page->object); /* - * This page has already been validated and found to - * be valid. However, it was mapped for "write" access - * sometime in the past, so we have to check if it was - * modified. If so, it needs to be revalidated. 
- * If the page was already found to be "tainted", no - * need to re-validate. + * This page was mapped for "write" access sometime in the + * past and could still be modifiable in the future. + * Consider it tainted. + * [ If the page was already found to be "tainted", no + * need to re-validate. ] */ - if (!page->dirty) { - vm_cs_query_modified++; - page->dirty = pmap_is_modified(page->phys_page); - } - if (page->dirty) { - /* - * The page is dirty, so let's clear its - * "validated" bit and re-validate it. - */ - if (cs_debug) { - printf("CODESIGNING: vm_page_validate_cs: " - "page %p obj %p off 0x%llx " - "was modified\n", - page, page->object, page->offset); - } - page->cs_validated = FALSE; - vm_cs_validated_dirtied++; + page->cs_validated = TRUE; + page->cs_tainted = TRUE; + if (cs_debug) { + printf("CODESIGNING: vm_page_validate_cs: " + "page %p obj %p off 0x%llx " + "was modified\n", + page, page->object, page->offset); } + vm_cs_validated_dirtied++; } if (page->cs_validated) { @@ -4188,8 +4298,6 @@ vm_page_validate_cs( vm_object_lock_assert_exclusive(page->object); - vm_cs_validates++; - object = page->object; assert(object->code_signed); offset = page->offset; @@ -4215,53 +4323,20 @@ vm_page_validate_cs( object, offset, &ksize, + VM_PROT_READ, FALSE); /* can't unlock object ! */ if (kr != KERN_SUCCESS) { panic("vm_page_validate_cs: could not map page: 0x%x\n", kr); } kaddr = CAST_DOWN(vm_offset_t, koffset); - /* - * Since we get here to validate a page that was brought in by - * the pager, we know that this pager is all setup and ready - * by now. - */ - assert(!object->internal); - assert(object->pager != NULL); - assert(object->pager_ready); - - if (!object->alive || object->terminating || object->pager == NULL) { - /* - * The object is terminating and we don't have its pager - * so we can't validate the data... 
- */ - goto out; - } - - pager = object->pager; - assert(pager != NULL); - - kr = vnode_pager_get_object_cs_blobs(pager, &blobs); - if (kr != KERN_SUCCESS) { - blobs = NULL; - } - - /* verify the SHA1 hash for this page */ - validated = cs_validate_page(blobs, - offset + object->paging_offset, - (const void *)kaddr, - &tainted); + /* validate the mapped page */ + vm_page_validate_cs_mapped(page, (const void *) kaddr); assert(page->busy); assert(object == page->object); vm_object_lock_assert_exclusive(object); - page->cs_validated = validated; - if (validated) { - page->cs_tainted = tainted; - } - -out: if (!busy_page) { PAGE_WAKEUP_DONE(page); } diff --git a/osfmk/vm/vm_init.c b/osfmk/vm/vm_init.c index 11927893c..7b6b17dc6 100644 --- a/osfmk/vm/vm_init.c +++ b/osfmk/vm/vm_init.c @@ -131,7 +131,7 @@ vm_mem_bootstrap(void) vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling pmap_init\n")); pmap_init(); - if (PE_parse_boot_arg("zsize", &zsizearg)) + if (PE_parse_boot_argn("zsize", &zsizearg, sizeof (zsizearg))) zsize = zsizearg * 1024ULL * 1024ULL; else { zsize = sane_size >> 2; /* Get target zone size as 1/4 of physical memory */ diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index 74e805b79..b4a2e5cf1 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -399,28 +399,6 @@ static zone_t vm_map_copy_zone; /* zone for vm_map_copy structures */ vm_object_t vm_submap_object; -/* - * vm_map_init: - * - * Initialize the vm_map module. Must be called before - * any other vm_map routines. - * - * Map and entry structures are allocated from zones -- we must - * initialize those zones. - * - * There are three zones of interest: - * - * vm_map_zone: used to allocate maps. - * vm_map_entry_zone: used to allocate map entries. - * vm_map_kentry_zone: used to allocate map entries for the kernel. - * - * The kernel allocates map entries from a special zone that is initially - * "crammed" with memory. 
It would be difficult (perhaps impossible) for - * the kernel to allocate more memory to a entry zone when it became - * empty since the very act of allocating memory implies the creation - * of a new entry. - */ - static void *map_data; static vm_map_size_t map_data_size; static void *kentry_data; @@ -433,12 +411,21 @@ static int kentry_count = 2048; /* to init kentry_data_size */ /* Skip acquiring locks if we're in the midst of a kernel core dump */ extern unsigned int not_in_kdp; -#ifdef __i386__ +#if CONFIG_CODE_DECRYPTION +/* + * vm_map_apple_protected: + * This remaps the requested part of the object with an object backed by + * the decrypting pager. + * crypt_info contains entry points and session data for the crypt module. + * The crypt_info block will be copied by vm_map_apple_protected. The data structures + * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called. + */ kern_return_t vm_map_apple_protected( vm_map_t map, vm_map_offset_t start, - vm_map_offset_t end) + vm_map_offset_t end, + struct pager_crypt_info *crypt_info) { boolean_t map_locked; kern_return_t kr; @@ -454,7 +441,7 @@ vm_map_apple_protected( if (!vm_map_lookup_entry(map, start, &map_entry) || - map_entry->vme_end != end || + map_entry->vme_end < end || map_entry->is_sub_map) { /* that memory is not properly mapped */ kr = KERN_INVALID_ARGUMENT; @@ -475,7 +462,7 @@ vm_map_apple_protected( * it. */ - protected_mem_obj = apple_protect_pager_setup(protected_object); + protected_mem_obj = apple_protect_pager_setup(protected_object, crypt_info); if (protected_mem_obj == NULL) { kr = KERN_FAILURE; goto done; @@ -499,10 +486,6 @@ vm_map_apple_protected( map_entry->max_protection, map_entry->inheritance); assert(map_addr == start); - if (kr == KERN_SUCCESS) { - /* let the pager know that this mem_obj is mapped */ - apple_protect_pager_map(protected_mem_obj); - } /* * Release the reference obtained by apple_protect_pager_setup(). 
* The mapping (if it succeeded) is now holding a reference on the @@ -516,9 +499,30 @@ vm_map_apple_protected( } return kr; } -#endif /* __i386__ */ +#endif /* CONFIG_CODE_DECRYPTION */ +/* + * vm_map_init: + * + * Initialize the vm_map module. Must be called before + * any other vm_map routines. + * + * Map and entry structures are allocated from zones -- we must + * initialize those zones. + * + * There are three zones of interest: + * + * vm_map_zone: used to allocate maps. + * vm_map_entry_zone: used to allocate map entries. + * vm_map_kentry_zone: used to allocate map entries for the kernel. + * + * The kernel allocates map entries from a special zone that is initially + * "crammed" with memory. It would be difficult (perhaps impossible) for + * the kernel to allocate more memory to a entry zone when it became + * empty since the very act of allocating memory implies the creation + * of a new entry. + */ void vm_map_init( void) @@ -612,6 +616,11 @@ vm_map_create( result->wiring_required = FALSE; result->no_zero_fill = FALSE; result->mapped = FALSE; +#if CONFIG_EMBEDDED + result->prot_copy_allow = FALSE; +#else + result->prot_copy_allow = TRUE; +#endif result->wait_for_space = FALSE; result->first_free = vm_map_to_entry(result); result->hint = vm_map_to_entry(result); @@ -1494,9 +1503,9 @@ static unsigned int vm_map_enter_restore_failures = 0; kern_return_t vm_map_enter( vm_map_t map, - vm_map_offset_t *address, /* IN/OUT */ + vm_map_offset_t *address, /* IN/OUT */ vm_map_size_t size, - vm_map_offset_t mask, + vm_map_offset_t mask, int flags, vm_object_t object, vm_object_offset_t offset, @@ -1521,6 +1530,32 @@ vm_map_enter( boolean_t is_submap = ((flags & VM_FLAGS_SUBMAP) != 0); char alias; vm_map_offset_t effective_min_offset, effective_max_offset; + kern_return_t kr; + +#if CONFIG_EMBEDDED + if (cur_protection & VM_PROT_WRITE) { + if (cur_protection & VM_PROT_EXECUTE) { + printf("EMBEDDED: %s curprot cannot be write+execute. 
turning off execute\n", __PRETTY_FUNCTION__); + cur_protection &= ~VM_PROT_EXECUTE; + } + } + if (max_protection & VM_PROT_WRITE) { + if (max_protection & VM_PROT_EXECUTE) { + /* Right now all kinds of data segments are RWX. No point in logging that. */ + /* printf("EMBEDDED: %s maxprot cannot be write+execute. turning off execute\n", __PRETTY_FUNCTION__); */ + + /* Try to take a hint from curprot. If curprot is not writable, + * make maxprot not writable. Otherwise make it not executable. + */ + if((cur_protection & VM_PROT_WRITE) == 0) { + max_protection &= ~VM_PROT_WRITE; + } else { + max_protection &= ~VM_PROT_EXECUTE; + } + } + } + assert ((cur_protection | max_protection) == max_protection); +#endif /* CONFIG_EMBEDDED */ if (is_submap) { if (purgable) { @@ -1925,8 +1960,6 @@ StartAgain: ; } } if (use_pmap && submap->pmap != NULL) { - kern_return_t kr; - kr = pmap_nest(map->pmap, submap->pmap, tmp_start, @@ -1983,13 +2016,56 @@ StartAgain: ; } BailOut: ; - if (result == KERN_SUCCESS && - pmap_empty && - !(flags & VM_FLAGS_NO_PMAP_CHECK)) { - assert(vm_map_pmap_is_empty(map, *address, *address+size)); - } + if (result == KERN_SUCCESS) { + vm_prot_t pager_prot; + memory_object_t pager; - if (result != KERN_SUCCESS) { + if (pmap_empty && + !(flags & VM_FLAGS_NO_PMAP_CHECK)) { + assert(vm_map_pmap_is_empty(map, + *address, + *address+size)); + } + + /* + * For "named" VM objects, let the pager know that the + * memory object is being mapped. Some pagers need to keep + * track of this, to know when they can reclaim the memory + * object, for example. + * VM calls memory_object_map() for each mapping (specifying + * the protection of each mapping) and calls + * memory_object_last_unmap() when all the mappings are gone. + */ + pager_prot = max_protection; + if (needs_copy) { + /* + * Copy-On-Write mapping: won't modify + * the memory object. 
+ */ + pager_prot &= ~VM_PROT_WRITE; + } + if (!is_submap && + object != VM_OBJECT_NULL && + object->named && + object->pager != MEMORY_OBJECT_NULL) { + vm_object_lock(object); + pager = object->pager; + if (object->named && + pager != MEMORY_OBJECT_NULL) { + assert(object->pager_ready); + vm_object_mapping_wait(object, THREAD_UNINT); + vm_object_mapping_begin(object); + vm_object_unlock(object); + + kr = memory_object_map(pager, pager_prot); + assert(kr == KERN_SUCCESS); + + vm_object_lock(object); + vm_object_mapping_end(object); + } + vm_object_unlock(object); + } + } else { if (new_mapping_established) { /* * We have to get rid of the new mappings since we @@ -2120,7 +2196,7 @@ vm_map_enter_mem_object( map_addr = vm_map_trunc_page(*address); map_size = vm_map_round_page(initial_size); size = vm_object_round_page(initial_size); - + /* * Find the vm object (if any) corresponding to this port. */ @@ -2318,6 +2394,50 @@ vm_map_enter_mem_object( return KERN_INVALID_OBJECT; } + if (object != VM_OBJECT_NULL && + object->named && + object->pager != MEMORY_OBJECT_NULL && + object->copy_strategy != MEMORY_OBJECT_COPY_NONE) { + memory_object_t pager; + vm_prot_t pager_prot; + kern_return_t kr; + + /* + * For "named" VM objects, let the pager know that the + * memory object is being mapped. Some pagers need to keep + * track of this, to know when they can reclaim the memory + * object, for example. + * VM calls memory_object_map() for each mapping (specifying + * the protection of each mapping) and calls + * memory_object_last_unmap() when all the mappings are gone. + */ + pager_prot = max_protection; + if (copy) { + /* + * Copy-On-Write mapping: won't modify the + * memory object. 
+ */ + pager_prot &= ~VM_PROT_WRITE; + } + vm_object_lock(object); + pager = object->pager; + if (object->named && + pager != MEMORY_OBJECT_NULL && + object->copy_strategy != MEMORY_OBJECT_COPY_NONE) { + assert(object->pager_ready); + vm_object_mapping_wait(object, THREAD_UNINT); + vm_object_mapping_begin(object); + vm_object_unlock(object); + + kr = memory_object_map(pager, pager_prot); + assert(kr == KERN_SUCCESS); + + vm_object_lock(object); + vm_object_mapping_end(object); + } + vm_object_unlock(object); + } + /* * Perform the copy if requested */ @@ -3035,6 +3155,11 @@ vm_map_protect( vm_map_lock(map); + if ((new_prot & VM_PROT_COPY) && !map->prot_copy_allow) { + vm_map_unlock(map); + return(KERN_PROTECTION_FAILURE); + } + /* LP64todo - remove this check when vm_map_commpage64() * no longer has to stuff in a map_entry for the commpage * above the map's max_offset. @@ -3085,6 +3210,15 @@ vm_map_protect( } } +#if CONFIG_EMBEDDED + if (new_prot & VM_PROT_WRITE) { + if (new_prot & VM_PROT_EXECUTE) { + printf("EMBEDDED: %s can't have both write and exec at the same time\n", __FUNCTION__); + new_prot &= ~VM_PROT_EXECUTE; + } + } +#endif + prev = current->vme_end; current = current->vme_next; } @@ -6101,15 +6235,6 @@ vm_map_copy_overwrite_aligned( entry->wired_count = 0; entry->user_wired_count = 0; offset = entry->offset = copy_entry->offset; - /* - * XXX FBDP - * We should propagate the submap entry's protections - * here instead of forcing VM_PROT_ALL. - * Or better yet, we should inherit the protection - * of the copy_entry. 
- */ - entry->protection = VM_PROT_ALL; - entry->max_protection = VM_PROT_ALL; vm_map_copy_entry_unlink(copy, copy_entry); vm_map_copy_entry_dispose(copy, copy_entry); @@ -10853,6 +10978,11 @@ vm_map_page_info( if (m->speculative) *disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE; + if (m->cs_validated) + *disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED; + if (m->cs_tainted) + *disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED; + page_query_done: vm_object_unlock(object); @@ -11499,3 +11629,11 @@ vm_map_set_user_wire_limit(vm_map_t map, { map->user_wire_limit = limit; } + +void vm_map_set_prot_copy_allow(vm_map_t map, + boolean_t allow) +{ + vm_map_lock(map); + map->prot_copy_allow = allow; + vm_map_unlock(map); +}; diff --git a/osfmk/vm/vm_map.h b/osfmk/vm/vm_map.h index 0b181f35b..423930b97 100644 --- a/osfmk/vm/vm_map.h +++ b/osfmk/vm/vm_map.h @@ -284,6 +284,7 @@ struct _vm_map { boolean_t wiring_required;/* All memory wired? */ boolean_t no_zero_fill; /* No zero fill absent pages */ boolean_t mapped; /* has this map been mapped */ + boolean_t prot_copy_allow;/* is VM_PROT_COPY allowed on this map */ unsigned int timestamp; /* Version number */ unsigned int color_rr; /* next color (not protected by a lock) */ } ; @@ -923,6 +924,10 @@ extern void vm_map_set_user_wire_limit( vm_map_t map, vm_size_t limit); +extern void vm_map_set_prot_copy_allow( + vm_map_t map, + boolean_t allow); + #ifdef MACH_KERNEL_PRIVATE /* diff --git a/osfmk/vm/vm_object.c b/osfmk/vm/vm_object.c index 680c07f12..d290fa801 100644 --- a/osfmk/vm/vm_object.c +++ b/osfmk/vm/vm_object.c @@ -520,6 +520,7 @@ vm_object_bootstrap(void) /* cache bitfields */ vm_object_template.wimg_bits = VM_WIMG_DEFAULT; vm_object_template.code_signed = FALSE; + vm_object_template.mapping_in_progress = FALSE; vm_object_template.not_in_use = 0; #ifdef UPL_DEBUG vm_object_template.uplq.prev = NULL; @@ -753,10 +754,12 @@ vm_object_deallocate( /* more mappers for this object */ if (pager != MEMORY_OBJECT_NULL) { + 
vm_object_mapping_wait(object, THREAD_UNINT); + vm_object_mapping_begin(object); vm_object_unlock(object); vm_object_cache_unlock(); - memory_object_unmap(pager); + memory_object_last_unmap(pager); try_failed_count = 0; for (;;) { @@ -777,6 +780,8 @@ vm_object_deallocate( mutex_pause(try_failed_count); /* wait a bit */ } assert(object->ref_count > 0); + + vm_object_mapping_end(object); } } @@ -2210,7 +2215,12 @@ vm_object_copy_slowly( /* fall thru */ case VM_FAULT_INTERRUPTED: + vm_object_lock(new_object); + vm_page_lock_queues(); vm_page_free(new_page); + vm_page_unlock_queues(); + vm_object_unlock(new_object); + vm_object_deallocate(new_object); vm_object_deallocate(src_object); *_result_object = VM_OBJECT_NULL; @@ -2225,9 +2235,11 @@ vm_object_copy_slowly( * any page fails [chosen] */ + vm_object_lock(new_object); vm_page_lock_queues(); vm_page_free(new_page); vm_page_unlock_queues(); + vm_object_unlock(new_object); vm_object_deallocate(new_object); vm_object_deallocate(src_object); @@ -3663,7 +3675,7 @@ vm_object_do_bypass( * Since its ref_count was at least 2, it * will not vanish; so we don't need to call * vm_object_deallocate. - * [FBDP: that doesn't seem to be true any more] + * [with a caveat for "named" objects] * * The res_count on the backing object is * conditionally decremented. It's possible @@ -3681,7 +3693,8 @@ vm_object_do_bypass( * is temporary and cachable. #endif */ - if (backing_object->ref_count > 1) { + if (backing_object->ref_count > 2 || + (!backing_object->named && backing_object->ref_count > 1)) { vm_object_lock_assert_exclusive(backing_object); backing_object->ref_count--; #if TASK_SWAPPER @@ -4067,10 +4080,11 @@ vm_object_collapse( * backing object that show through to the object. 
*/ #if MACH_PAGEMAP - if (backing_rcount || backing_object->existence_map) { + if (backing_rcount || backing_object->existence_map) #else - if (backing_rcount) { + if (backing_rcount) #endif /* MACH_PAGEMAP */ + { offset = hint_offset; while((offset = @@ -5132,6 +5146,9 @@ vm_object_lock_request( return (KERN_SUCCESS); } +unsigned int vm_page_purged_wired = 0; +unsigned int vm_page_purged_busy = 0; +unsigned int vm_page_purged_others = 0; /* * Empty a purgeable object by grabbing the physical pages assigned to it and * putting them on the free queue without writing them to backing store, etc. @@ -5200,18 +5217,38 @@ vm_object_purge(vm_object_t object) /* resume with the current page and a new quota */ purge_loop_quota = PURGE_LOOP_QUOTA; } - - - if (p->busy || p->cleaning || p->laundry || - p->list_req_pending) { - /* page is being acted upon, so don't mess with it */ - continue; - } + if (p->wire_count) { /* don't discard a wired page */ + vm_page_purged_wired++; + + skip_page: + /* + * This page is no longer "purgeable", + * for accounting purposes. + */ + assert(vm_page_purgeable_count > 0); + vm_page_purgeable_count--; continue; } + if (p->busy) { + /* + * We can't reclaim a busy page but we can deactivate + * it (if it's not wired) to make sure it gets + * considered by vm_pageout_scan() later. + */ + vm_page_deactivate(p); + vm_page_purged_busy++; + goto skip_page; + } + + if (p->cleaning || p->laundry || p->list_req_pending) { + /* page is being acted upon, so don't mess with it */ + vm_page_purged_others++; + goto skip_page; + } + assert(!p->laundry); assert(p->object != kernel_object); @@ -5237,6 +5274,12 @@ vm_object_purge(vm_object_t object) } vm_page_free_prepare(p); + /* + * vm_page_purgeable_count is not updated when freeing + * a page from an "empty" object, so do it explicitly here. + */ + assert(vm_page_purgeable_count > 0); + vm_page_purgeable_count--; /* ... 
and put it on our queue of pages to free */ assert(p->pageq.next == NULL && @@ -5379,11 +5422,11 @@ vm_object_purgable_control( if (old_state != VM_PURGABLE_NONVOLATILE) { vm_page_lock_queues(); - assert(vm_page_purgeable_count >= - object->resident_page_count); - vm_page_purgeable_count -= object->resident_page_count; - if (old_state==VM_PURGABLE_VOLATILE) { + assert(vm_page_purgeable_count >= + object->resident_page_count); + vm_page_purgeable_count -= object->resident_page_count; + assert(object->objq.next != NULL && object->objq.prev != NULL); /* object should be on a queue */ purgeable_q_t queue = vm_purgeable_object_remove(object); assert(queue); @@ -5397,13 +5440,14 @@ vm_object_purgable_control( case VM_PURGABLE_VOLATILE: - if ((old_state != VM_PURGABLE_NONVOLATILE) && (old_state != VM_PURGABLE_VOLATILE)) + if (old_state == VM_PURGABLE_EMPTY && + object->resident_page_count == 0) break; purgeable_q_t queue; /* find the correct queue */ if ((*state&VM_PURGABLE_ORDERING_MASK) == VM_PURGABLE_ORDERING_OBSOLETE) - queue = &purgeable_queues[PURGEABLE_Q_TYPE_FIFO]; + queue = &purgeable_queues[PURGEABLE_Q_TYPE_OBSOLETE]; else { if ((*state&VM_PURGABLE_BEHAVIOR_MASK) == VM_PURGABLE_BEHAVIOR_FIFO) queue = &purgeable_queues[PURGEABLE_Q_TYPE_FIFO]; @@ -5411,7 +5455,8 @@ vm_object_purgable_control( queue = &purgeable_queues[PURGEABLE_Q_TYPE_LIFO]; } - if (old_state == VM_PURGABLE_NONVOLATILE) { + if (old_state == VM_PURGABLE_NONVOLATILE || + old_state == VM_PURGABLE_EMPTY) { /* try to add token... 
this can fail */ vm_page_lock_queues(); @@ -5474,10 +5519,12 @@ vm_object_purgable_control( vm_purgeable_token_delete_first(old_queue); } - if (old_state==VM_PURGABLE_NONVOLATILE) { - vm_page_purgeable_count += object->resident_page_count; + if (old_state==VM_PURGABLE_NONVOLATILE || + old_state == VM_PURGABLE_EMPTY) { vm_page_lock_queues(); + vm_page_purgeable_count += object->resident_page_count; } + object->purgable = VM_PURGABLE_VOLATILE; (void) vm_object_purge(object); vm_page_unlock_queues(); } diff --git a/osfmk/vm/vm_object.h b/osfmk/vm/vm_object.h index a093bf2f4..68f60ef83 100644 --- a/osfmk/vm/vm_object.h +++ b/osfmk/vm/vm_object.h @@ -288,7 +288,8 @@ struct vm_object { code_signed:1, /* pages are signed and should be validated; the signatures are stored with the pager */ - not_in_use:23; /* for expansion */ + mapping_in_progress:1, /* pager being mapped/unmapped */ + not_in_use:22; /* for expansion */ #ifdef UPL_DEBUG queue_head_t uplq; /* List of outstanding upls */ @@ -637,6 +638,7 @@ extern kern_return_t vm_object_range_op( #define VM_OBJECT_EVENT_INITIALIZED 0 #define VM_OBJECT_EVENT_PAGER_READY 1 #define VM_OBJECT_EVENT_PAGING_IN_PROGRESS 2 +#define VM_OBJECT_EVENT_MAPPING_IN_PROGRESS 3 #define VM_OBJECT_EVENT_LOCK_IN_PROGRESS 4 #define VM_OBJECT_EVENT_UNCACHING 5 #define VM_OBJECT_EVENT_COPY_CALL 6 @@ -725,6 +727,38 @@ extern kern_return_t vm_object_range_op( MACRO_END +#define vm_object_mapping_begin(object) \ + MACRO_BEGIN \ + vm_object_lock_assert_exclusive((object)); \ + assert(! 
(object)->mapping_in_progress); \ + (object)->mapping_in_progress = TRUE; \ + MACRO_END + +#define vm_object_mapping_end(object) \ + MACRO_BEGIN \ + vm_object_lock_assert_exclusive((object)); \ + assert((object)->mapping_in_progress); \ + (object)->mapping_in_progress = FALSE; \ + vm_object_wakeup((object), \ + VM_OBJECT_EVENT_MAPPING_IN_PROGRESS); \ + MACRO_END + +#define vm_object_mapping_wait(object, interruptible) \ + MACRO_BEGIN \ + vm_object_lock_assert_exclusive((object)); \ + while ((object)->mapping_in_progress) { \ + wait_result_t _wr; \ + \ + _wr = vm_object_sleep((object), \ + VM_OBJECT_EVENT_MAPPING_IN_PROGRESS, \ + (interruptible)); \ + /*XXX if ((interruptible) && (_wr != THREAD_AWAKENED))*/\ + /*XXX break; */ \ + } \ + assert(!(object)->mapping_in_progress); \ + MACRO_END + + #define OBJECT_LOCK_SHARED 0 #define OBJECT_LOCK_EXCLUSIVE 1 diff --git a/osfmk/vm/vm_page.h b/osfmk/vm/vm_page.h index 3319ca213..89310abe8 100644 --- a/osfmk/vm/vm_page.h +++ b/osfmk/vm/vm_page.h @@ -531,6 +531,9 @@ extern void vm_page_gobble( vm_page_t page); extern void vm_page_validate_cs(vm_page_t page); +extern void vm_page_validate_cs_mapped( + vm_page_t page, + const void *kaddr); /* * Functions implemented as macros. 
m->wanted and m->busy are diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c index 18ff4907a..9502c60ae 100644 --- a/osfmk/vm/vm_pageout.c +++ b/osfmk/vm/vm_pageout.c @@ -114,12 +114,8 @@ #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE /* maximum iterations of the active queue to move pages to inactive */ -#ifdef CONFIG_EMBEDDED -#define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 2048 -#else #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 100 #endif -#endif #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */ #ifdef CONFIG_EMBEDDED @@ -1014,7 +1010,7 @@ vm_pageout_scan(void) * Don't sweep through active queue more than the throttle * which should be kept relatively low */ - active_burst_count = vm_pageout_burst_active_throttle; + active_burst_count = MIN(vm_pageout_burst_active_throttle, vm_page_active_count); /* * Move pages from active to inactive. @@ -1171,9 +1167,23 @@ vm_pageout_scan(void) * inactive target still not met... keep going * until we get the queues balanced */ + + /* + * Recalculate vm_page_inactivate_target. 
+ */ + vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count + + vm_page_inactive_count + + vm_page_speculative_count); + +#ifndef CONFIG_EMBEDDED + /* + * XXX: if no active pages can be reclaimed, pageout scan can be stuck trying + * to balance the queues + */ if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) && !queue_empty(&vm_page_queue_active)) continue; +#endif mutex_lock(&vm_page_queue_free_lock); @@ -1257,7 +1267,10 @@ vm_pageout_scan(void) msecs = vm_pageout_empty_wait; goto vm_pageout_scan_delay; - } else if (inactive_burst_count >= vm_pageout_burst_inactive_throttle) { + } else if (inactive_burst_count >= + MIN(vm_pageout_burst_inactive_throttle, + (vm_page_inactive_count + + vm_page_speculative_count))) { vm_pageout_scan_burst_throttle++; msecs = vm_pageout_burst_wait; goto vm_pageout_scan_delay; @@ -3307,15 +3320,17 @@ vm_object_upl_request( upl->highest_page = dst_page->phys_page; if (user_page_list) { user_page_list[entry].phys_addr = dst_page->phys_page; - user_page_list[entry].dirty = dst_page->dirty; user_page_list[entry].pageout = dst_page->pageout; user_page_list[entry].absent = dst_page->absent; + user_page_list[entry].dirty = dst_page->dirty; user_page_list[entry].precious = dst_page->precious; - + user_page_list[entry].device = FALSE; if (dst_page->clustered == TRUE) user_page_list[entry].speculative = dst_page->speculative; else user_page_list[entry].speculative = FALSE; + user_page_list[entry].cs_validated = dst_page->cs_validated; + user_page_list[entry].cs_tainted = dst_page->cs_tainted; } /* * if UPL_RET_ONLY_ABSENT is set, then @@ -4003,6 +4018,23 @@ upl_commit_range( } delayed_unlock = 1; + if (shadow_object->code_signed) { + /* + * CODE SIGNING: + * If the object is code-signed, do not let this UPL tell + * us if the pages are valid or not. Let the pages be + * validated by VM the normal way (when they get mapped or + * copied). + */ + flags &= ~UPL_COMMIT_CS_VALIDATED; + } + if (! 
page_list) { + /* + * No page list to get the code-signing info from !? + */ + flags &= ~UPL_COMMIT_CS_VALIDATED; + } + while (xfer_size) { vm_page_t t, m; @@ -4030,60 +4062,34 @@ upl_commit_range( m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset); } } - if (m != VM_PAGE_NULL) { - - clear_refmod = 0; + if (m == VM_PAGE_NULL) { + goto commit_next_page; + } - if (upl->flags & UPL_IO_WIRE) { + clear_refmod = 0; - vm_page_unwire(m); - - if (page_list) - page_list[entry].phys_addr = 0; + if (flags & UPL_COMMIT_CS_VALIDATED) { + /* + * CODE SIGNING: + * Set the code signing bits according to + * what the UPL says they should be. + */ + m->cs_validated = page_list[entry].cs_validated; + m->cs_tainted = page_list[entry].cs_tainted; + } + if (upl->flags & UPL_IO_WIRE) { - if (flags & UPL_COMMIT_SET_DIRTY) - m->dirty = TRUE; - else if (flags & UPL_COMMIT_CLEAR_DIRTY) { - m->dirty = FALSE; - if (m->cs_validated && !m->cs_tainted) { - /* - * CODE SIGNING: - * This page is no longer dirty - * but could have been modified, - * so it will need to be - * re-validated. - */ - m->cs_validated = FALSE; - vm_cs_validated_resets++; - } - clear_refmod |= VM_MEM_MODIFIED; - } - if (flags & UPL_COMMIT_INACTIVATE) - vm_page_deactivate(m); + vm_page_unwire(m); - if (clear_refmod) - pmap_clear_refmod(m->phys_page, clear_refmod); + if (page_list) + page_list[entry].phys_addr = 0; - if (flags & UPL_COMMIT_ALLOW_ACCESS) { - /* - * We blocked access to the pages in this UPL. - * Clear the "busy" bit and wake up any waiter - * for this page. 
- */ - PAGE_WAKEUP_DONE(m); - } - goto commit_next_page; - } - /* - * make sure to clear the hardware - * modify or reference bits before - * releasing the BUSY bit on this page - * otherwise we risk losing a legitimate - * change of state - */ - if (flags & UPL_COMMIT_CLEAR_DIRTY) { - m->dirty = FALSE; - if (m->cs_validated && !m->cs_tainted) { + if (flags & UPL_COMMIT_SET_DIRTY) + m->dirty = TRUE; + else if (flags & UPL_COMMIT_CLEAR_DIRTY) { + m->dirty = FALSE; + if (! (flags & UPL_COMMIT_CS_VALIDATED) && + m->cs_validated && !m->cs_tainted) { /* * CODE SIGNING: * This page is no longer dirty @@ -4096,109 +4102,89 @@ upl_commit_range( } clear_refmod |= VM_MEM_MODIFIED; } - if (clear_refmod) - pmap_clear_refmod(m->phys_page, clear_refmod); - - if (page_list) { - upl_page_info_t *p; + + if (flags & UPL_COMMIT_INACTIVATE) + vm_page_deactivate(m); - p = &(page_list[entry]); + if (clear_refmod) + pmap_clear_refmod(m->phys_page, clear_refmod); - if (p->phys_addr && p->pageout && !m->pageout) { - m->busy = TRUE; - m->pageout = TRUE; - vm_page_wire(m); - } else if (p->phys_addr && - !p->pageout && m->pageout && - !m->dump_cleaning) { - m->pageout = FALSE; - m->absent = FALSE; - m->overwriting = FALSE; - vm_page_unwire(m); + if (flags & UPL_COMMIT_ALLOW_ACCESS) { + /* + * We blocked access to the pages in this UPL. + * Clear the "busy" bit and wake up any waiter + * for this page. + */ + PAGE_WAKEUP_DONE(m); + } + goto commit_next_page; + } + /* + * make sure to clear the hardware + * modify or reference bits before + * releasing the BUSY bit on this page + * otherwise we risk losing a legitimate + * change of state + */ + if (flags & UPL_COMMIT_CLEAR_DIRTY) { + m->dirty = FALSE; - PAGE_WAKEUP_DONE(m); - } - page_list[entry].phys_addr = 0; + if (! (flags & UPL_COMMIT_CS_VALIDATED) && + m->cs_validated && !m->cs_tainted) { + /* + * CODE SIGNING: + * This page is no longer dirty + * but could have been modified, + * so it will need to be + * re-validated. 
+ */ + m->cs_validated = FALSE; +#if DEVELOPMENT || DEBUG + vm_cs_validated_resets++; +#endif } - m->dump_cleaning = FALSE; + clear_refmod |= VM_MEM_MODIFIED; + } + if (clear_refmod) + pmap_clear_refmod(m->phys_page, clear_refmod); - if (m->laundry) - vm_pageout_throttle_up(m); + if (page_list) { + upl_page_info_t *p; - if (m->pageout) { - m->cleaning = FALSE; - m->encrypted_cleaning = FALSE; + p = &(page_list[entry]); + + if (p->phys_addr && p->pageout && !m->pageout) { + m->busy = TRUE; + m->pageout = TRUE; + vm_page_wire(m); + } else if (p->phys_addr && + !p->pageout && m->pageout && + !m->dump_cleaning) { m->pageout = FALSE; -#if MACH_CLUSTER_STATS - if (m->wanted) vm_pageout_target_collisions++; -#endif - m->dirty = FALSE; - if (m->cs_validated && !m->cs_tainted) { - /* - * CODE SIGNING: - * This page is no longer dirty - * but could have been modified, - * so it will need to be - * re-validated. - */ - m->cs_validated = FALSE; - vm_cs_validated_resets++; - } - - if (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)) - m->dirty = TRUE; - - if (m->dirty) { - /* - * page was re-dirtied after we started - * the pageout... 
reactivate it since - * we don't know whether the on-disk - * copy matches what is now in memory - */ - vm_page_unwire(m); - - if (upl->flags & UPL_PAGEOUT) { - CLUSTER_STAT(vm_pageout_target_page_dirtied++;) - VM_STAT_INCR(reactivations); - DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL); - } - PAGE_WAKEUP_DONE(m); - } else { - /* - * page has been successfully cleaned - * go ahead and free it for other use - */ + m->absent = FALSE; + m->overwriting = FALSE; + vm_page_unwire(m); + + PAGE_WAKEUP_DONE(m); + } + page_list[entry].phys_addr = 0; + } + m->dump_cleaning = FALSE; - if (m->object->internal) { - DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL); - } else { - DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL); - } + if (m->laundry) + vm_pageout_throttle_up(m); - vm_page_free(m); - - if (upl->flags & UPL_PAGEOUT) { - CLUSTER_STAT(vm_pageout_target_page_freed++;) - - if (page_list[entry].dirty) { - VM_STAT_INCR(pageouts); - DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL); - pgpgout_count++; - } - } - } - goto commit_next_page; - } + if (m->pageout) { + m->cleaning = FALSE; + m->encrypted_cleaning = FALSE; + m->pageout = FALSE; #if MACH_CLUSTER_STATS - if (m->wpmapped) - m->dirty = pmap_is_modified(m->phys_page); - - if (m->dirty) vm_pageout_cluster_dirtied++; - else vm_pageout_cluster_cleaned++; - if (m->wanted) vm_pageout_cluster_collisions++; + if (m->wanted) vm_pageout_target_collisions++; #endif m->dirty = FALSE; - if (m->cs_validated && !m->cs_tainted) { + + if (! (flags & UPL_COMMIT_CS_VALIDATED) && + m->cs_validated && !m->cs_tainted) { /* * CODE SIGNING: * This page is no longer dirty @@ -4207,67 +4193,138 @@ upl_commit_range( * re-validated. 
*/ m->cs_validated = FALSE; +#if DEVELOPMENT || DEBUG vm_cs_validated_resets++; +#endif } - - if ((m->busy) && (m->cleaning)) { - /* - * the request_page_list case + + if (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)) + m->dirty = TRUE; + + if (m->dirty) { + /* + * page was re-dirtied after we started + * the pageout... reactivate it since + * we don't know whether the on-disk + * copy matches what is now in memory */ - m->absent = FALSE; - m->overwriting = FALSE; - m->busy = FALSE; - } else if (m->overwriting) { - /* - * alternate request page list, write to - * page_list case. Occurs when the original - * page was wired at the time of the list - * request + vm_page_unwire(m); + + if (upl->flags & UPL_PAGEOUT) { + CLUSTER_STAT(vm_pageout_target_page_dirtied++;) + VM_STAT_INCR(reactivations); + DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL); + } + PAGE_WAKEUP_DONE(m); + } else { + /* + * page has been successfully cleaned + * go ahead and free it for other use */ - assert(m->wire_count != 0); - vm_page_unwire(m);/* reactivates */ - m->overwriting = FALSE; + + if (m->object->internal) { + DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL); + } else { + DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL); + } + + vm_page_free(m); + + if (upl->flags & UPL_PAGEOUT) { + CLUSTER_STAT(vm_pageout_target_page_freed++;) + + if (page_list[entry].dirty) { + VM_STAT_INCR(pageouts); + DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL); + pgpgout_count++; + } + } } - m->cleaning = FALSE; - m->encrypted_cleaning = FALSE; + goto commit_next_page; + } +#if MACH_CLUSTER_STATS + if (m->wpmapped) + m->dirty = pmap_is_modified(m->phys_page); + + if (m->dirty) vm_pageout_cluster_dirtied++; + else vm_pageout_cluster_cleaned++; + if (m->wanted) vm_pageout_cluster_collisions++; +#endif + m->dirty = FALSE; + if (! 
(flags & UPL_COMMIT_CS_VALIDATED) && + m->cs_validated && !m->cs_tainted) { /* - * It is a part of the semantic of COPYOUT_FROM - * UPLs that a commit implies cache sync - * between the vm page and the backing store - * this can be used to strip the precious bit - * as well as clean + * CODE SIGNING: + * This page is no longer dirty + * but could have been modified, + * so it will need to be + * re-validated. */ - if (upl->flags & UPL_PAGE_SYNC_DONE) - m->precious = FALSE; - - if (flags & UPL_COMMIT_SET_DIRTY) - m->dirty = TRUE; + m->cs_validated = FALSE; +#if DEVELOPMENT || DEBUG + vm_cs_validated_resets++; +#endif + } - if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) { + if ((m->busy) && (m->cleaning)) { + /* + * the request_page_list case + */ + m->absent = FALSE; + m->overwriting = FALSE; + m->busy = FALSE; + } else if (m->overwriting) { + /* + * alternate request page list, write to + * page_list case. Occurs when the original + * page was wired at the time of the list + * request + */ + assert(m->wire_count != 0); + vm_page_unwire(m);/* reactivates */ + m->overwriting = FALSE; + } + m->cleaning = FALSE; + m->encrypted_cleaning = FALSE; + + /* + * It is a part of the semantic of COPYOUT_FROM + * UPLs that a commit implies cache sync + * between the vm page and the backing store + * this can be used to strip the precious bit + * as well as clean + */ + if (upl->flags & UPL_PAGE_SYNC_DONE) + m->precious = FALSE; + + if (flags & UPL_COMMIT_SET_DIRTY) + m->dirty = TRUE; + + if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) { + vm_page_deactivate(m); + } else if (!m->active && !m->inactive && !m->speculative) { + + if (m->clustered) + vm_page_speculate(m, TRUE); + else if (m->reference) + vm_page_activate(m); + else vm_page_deactivate(m); - } else if (!m->active && !m->inactive && !m->speculative) { - - if (m->clustered) - vm_page_speculate(m, TRUE); - else if (m->reference) - vm_page_activate(m); - else - 
vm_page_deactivate(m); - } - if (flags & UPL_COMMIT_ALLOW_ACCESS) { - /* - * We blocked access to the pages in this URL. - * Clear the "busy" bit on this page before we - * wake up any waiter. - */ - m->busy = FALSE; - } + } + if (flags & UPL_COMMIT_ALLOW_ACCESS) { /* - * Wakeup any thread waiting for the page to be un-cleaning. + * We blocked access to the pages in this URL. + * Clear the "busy" bit on this page before we + * wake up any waiter. */ - PAGE_WAKEUP(m); + m->busy = FALSE; } + /* + * Wakeup any thread waiting for the page to be un-cleaning. + */ + PAGE_WAKEUP(m); + commit_next_page: target_offset += PAGE_SIZE_64; xfer_size -= PAGE_SIZE; @@ -5013,15 +5070,17 @@ vm_object_iopl_request( if (user_page_list) { user_page_list[entry].phys_addr = dst_page->phys_page; - user_page_list[entry].dirty = dst_page->dirty; user_page_list[entry].pageout = dst_page->pageout; user_page_list[entry].absent = dst_page->absent; + user_page_list[entry].dirty = dst_page->dirty; user_page_list[entry].precious = dst_page->precious; - + user_page_list[entry].device = FALSE; if (dst_page->clustered == TRUE) user_page_list[entry].speculative = dst_page->speculative; else user_page_list[entry].speculative = FALSE; + user_page_list[entry].cs_validated = dst_page->cs_validated; + user_page_list[entry].cs_tainted = dst_page->cs_tainted; } /* * someone is explicitly grabbing this page... 
@@ -5274,6 +5333,7 @@ vm_paging_map_object( vm_object_t object, vm_object_offset_t offset, vm_map_size_t *size, + vm_prot_t protection, boolean_t can_unlock_object) { kern_return_t kr; @@ -5282,7 +5342,7 @@ vm_paging_map_object( vm_object_offset_t object_offset; int i; - + if (page != VM_PAGE_NULL && *size == PAGE_SIZE) { assert(page->busy); /* @@ -5356,7 +5416,7 @@ vm_paging_map_object( PMAP_ENTER(kernel_pmap, page_map_offset, page, - VM_PROT_DEFAULT, + protection, ((int) page->object->wimg_bits & VM_WIMG_MASK), TRUE); @@ -5400,7 +5460,7 @@ vm_paging_map_object( object, object_offset, FALSE, - VM_PROT_DEFAULT, + protection, VM_PROT_ALL, VM_INHERIT_NONE); if (kr != KERN_SUCCESS) { @@ -5445,14 +5505,13 @@ vm_paging_map_object( pmap_sync_page_data_phys(page->phys_page); } page->pmapped = TRUE; - page->wpmapped = TRUE; cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK; //assert(pmap_verify_free(page->phys_page)); PMAP_ENTER(kernel_pmap, *address + page_map_offset, page, - VM_PROT_DEFAULT, + protection, cache_attr, TRUE); } @@ -5689,6 +5748,7 @@ vm_page_encrypt( page->object, page->offset, &kernel_mapping_size, + VM_PROT_READ | VM_PROT_WRITE, FALSE); if (kr != KERN_SUCCESS) { panic("vm_page_encrypt: " @@ -5813,6 +5873,7 @@ vm_page_decrypt( page->object, page->offset, &kernel_mapping_size, + VM_PROT_READ | VM_PROT_WRITE, FALSE); if (kr != KERN_SUCCESS) { panic("vm_page_decrypt: " diff --git a/osfmk/vm/vm_pageout.h b/osfmk/vm/vm_pageout.h index c7ab4ca8f..d5adb8b0f 100644 --- a/osfmk/vm/vm_pageout.h +++ b/osfmk/vm/vm_pageout.h @@ -236,6 +236,7 @@ extern kern_return_t vm_paging_map_object( vm_object_t object, vm_object_offset_t offset, vm_map_size_t *size, + vm_prot_t protection, boolean_t can_unlock_object); extern void vm_paging_unmap_object( vm_object_t object, diff --git a/osfmk/vm/vm_protos.h b/osfmk/vm/vm_protos.h index f1d0f65af..e9fdc6ef3 100644 --- a/osfmk/vm/vm_protos.h +++ b/osfmk/vm/vm_protos.h @@ -144,15 +144,17 @@ extern mach_vm_offset_t 
mach_get_vm_end(vm_map_t); extern vm_offset_t get_vm_start(vm_map_t); extern vm_offset_t get_vm_end(vm_map_t); -#ifdef __i386__ +#if CONFIG_CODE_DECRYPTION +struct pager_crypt_info; extern kern_return_t vm_map_apple_protected( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end); + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + struct pager_crypt_info *crypt_info); extern void apple_protect_pager_bootstrap(void); -extern memory_object_t apple_protect_pager_setup(vm_object_t backing_object); -extern void apple_protect_pager_map(memory_object_t mem_obj); -#endif /* __i386__ */ +extern memory_object_t apple_protect_pager_setup(vm_object_t backing_object, + struct pager_crypt_info *crypt_info); +#endif /* CONFIG_CODE_DECRYPTION */ /* @@ -238,7 +240,10 @@ extern kern_return_t vnode_pager_synchronize( memory_object_offset_t offset, vm_size_t length, vm_sync_t sync_flags); -extern kern_return_t vnode_pager_unmap( +extern kern_return_t vnode_pager_map( + memory_object_t mem_obj, + vm_prot_t prot); +extern kern_return_t vnode_pager_last_unmap( memory_object_t mem_obj); extern void vnode_pager_deallocate( memory_object_t); @@ -248,6 +253,9 @@ extern void vnode_pager_vrele( struct vnode *vp); extern void vnode_pager_release_from_cache( int *); +extern int ubc_map( + struct vnode *vp, + int flags); extern void ubc_unmap( struct vnode *vp); @@ -282,7 +290,9 @@ extern kern_return_t dp_memory_object_synchronize(memory_object_t, memory_object_offset_t, vm_size_t, vm_sync_t); -extern kern_return_t dp_memory_object_unmap(memory_object_t); +extern kern_return_t dp_memory_object_map(memory_object_t, + vm_prot_t); +extern kern_return_t dp_memory_object_last_unmap(memory_object_t); #endif /* _memory_object_server_ */ #ifndef _memory_object_default_server_ extern kern_return_t default_pager_memory_object_create( @@ -321,7 +331,8 @@ extern kern_return_t device_pager_synchronize(memory_object_t, memory_object_offset_t, vm_size_t, vm_sync_t); -extern 
kern_return_t device_pager_unmap(memory_object_t); +extern kern_return_t device_pager_map(memory_object_t, vm_prot_t); +extern kern_return_t device_pager_last_unmap(memory_object_t); extern kern_return_t device_pager_populate_object( memory_object_t device, memory_object_offset_t offset, @@ -347,7 +358,7 @@ extern int macx_swapinfo( boolean_t *encrypted_p); extern void log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot); -extern int cs_invalid_page(void); +extern int cs_invalid_page(addr64_t vaddr); extern boolean_t cs_validate_page(void *blobs, memory_object_offset_t offset, const void *data, diff --git a/osfmk/vm/vm_purgeable.c b/osfmk/vm/vm_purgeable.c index ab0f00a42..bf80947ab 100644 --- a/osfmk/vm/vm_purgeable.c +++ b/osfmk/vm/vm_purgeable.c @@ -88,9 +88,14 @@ vm_purgeable_token_check_queue(purgeable_q_t queue) if (unripe) assert(queue->token_q_unripe == unripe); assert(token_cnt == queue->debug_count_tokens); - our_inactive_count = page_cnt + queue->new_pages + token_new_pagecount; - assert(our_inactive_count >= 0); - assert((uint32_t) our_inactive_count == vm_page_inactive_count); + + /* obsolete queue doesn't maintain token counts */ + if(queue->type != PURGEABLE_Q_TYPE_OBSOLETE) + { + our_inactive_count = page_cnt + queue->new_pages + token_new_pagecount; + assert(our_inactive_count >= 0); + assert((uint32_t) our_inactive_count == vm_page_inactive_count); + } } #endif @@ -515,11 +520,12 @@ vm_purgeable_object_purge_one(void) enum purgeable_q_type i; int group; vm_object_t object = 0; + purgeable_q_t queue, queue2; mutex_lock(&vm_purgeable_queue_lock); /* Cycle through all queues */ for (i = PURGEABLE_Q_TYPE_OBSOLETE; i < PURGEABLE_Q_TYPE_MAX; i++) { - purgeable_q_t queue = &purgeable_queues[i]; + queue = &purgeable_queues[i]; /* * Are there any ripe tokens on this queue? If yes, we'll @@ -536,17 +542,21 @@ vm_purgeable_object_purge_one(void) * lock, remove a token and then purge the object. 
*/ for (group = 0; group < NUM_VOLATILE_GROUPS; group++) { - if (!queue_empty(&queue->objq[group]) && (object = vm_purgeable_object_find_and_lock(queue, group))) { + if (!queue_empty(&queue->objq[group]) && + (object = vm_purgeable_object_find_and_lock(queue, group))) { mutex_unlock(&vm_purgeable_queue_lock); vm_purgeable_token_choose_and_delete_ripe(queue, 0); goto purge_now; - } else { - assert(i != PURGEABLE_Q_TYPE_OBSOLETE); /* obsolete queue must - * have all objects in - * group 0 */ - purgeable_q_t queue2 = &purgeable_queues[i != PURGEABLE_Q_TYPE_FIFO ? PURGEABLE_Q_TYPE_FIFO : PURGEABLE_Q_TYPE_LIFO]; - - if (!queue_empty(&queue2->objq[group]) && (object = vm_purgeable_object_find_and_lock(queue2, group))) { + } + if (i != PURGEABLE_Q_TYPE_OBSOLETE) { + /* This is the token migration case, and it works between + * FIFO and LIFO only */ + queue2 = &purgeable_queues[i != PURGEABLE_Q_TYPE_FIFO ? + PURGEABLE_Q_TYPE_FIFO : + PURGEABLE_Q_TYPE_LIFO]; + + if (!queue_empty(&queue2->objq[group]) && + (object = vm_purgeable_object_find_and_lock(queue2, group))) { mutex_unlock(&vm_purgeable_queue_lock); vm_purgeable_token_choose_and_delete_ripe(queue2, queue); goto purge_now; @@ -611,7 +621,7 @@ vm_purgeable_object_remove(vm_object_t object) int group; mutex_lock(&vm_purgeable_queue_lock); - for (i = PURGEABLE_Q_TYPE_FIFO; i < PURGEABLE_Q_TYPE_MAX; i++) { + for (i = PURGEABLE_Q_TYPE_OBSOLETE; i < PURGEABLE_Q_TYPE_MAX; i++) { purgeable_q_t queue = &purgeable_queues[i]; for (group = 0; group < NUM_VOLATILE_GROUPS; group++) { vm_object_t o; diff --git a/osfmk/vm/vm_resident.c b/osfmk/vm/vm_resident.c index b92e35ae0..7e7520ce1 100644 --- a/osfmk/vm/vm_resident.c +++ b/osfmk/vm/vm_resident.c @@ -340,7 +340,7 @@ vm_page_set_colors( void ) { unsigned int n, override; - if ( PE_parse_boot_arg("colors", &override) ) /* colors specified as a boot-arg? */ + if ( PE_parse_boot_argn("colors", &override, sizeof (override)) ) /* colors specified as a boot-arg? 
*/ n = override; else if ( vm_cache_geometry_colors ) /* do we know what the cache geometry is? */ n = vm_cache_geometry_colors; @@ -684,7 +684,7 @@ pmap_startup( * Check if we want to initialize pages to a known value */ fill = 0; /* Assume no fill */ - if (PE_parse_boot_arg("fill", &fillval)) fill = 1; /* Set fill */ + if (PE_parse_boot_argn("fill", &fillval, sizeof (fillval))) fill = 1; /* Set fill */ /* @@ -957,13 +957,19 @@ vm_page_insert_internal( object->resident_page_count++; - if (object->purgable == VM_PURGABLE_VOLATILE || - object->purgable == VM_PURGABLE_EMPTY) { + if (object->purgable == VM_PURGABLE_VOLATILE) { if (queues_lock_held == FALSE) vm_page_lockspin_queues(); vm_page_purgeable_count++; + if (queues_lock_held == FALSE) + vm_page_unlock_queues(); + } else if (object->purgable == VM_PURGABLE_EMPTY && + mem->throttled) { + if (queues_lock_held == FALSE) + vm_page_lock_queues(); + vm_page_deactivate(mem); if (queues_lock_held == FALSE) vm_page_unlock_queues(); } @@ -1053,8 +1059,7 @@ vm_page_replace( found_m->offset = (vm_object_offset_t) -1; object->resident_page_count--; - if (object->purgable == VM_PURGABLE_VOLATILE || - object->purgable == VM_PURGABLE_EMPTY) { + if (object->purgable == VM_PURGABLE_VOLATILE) { assert(vm_page_purgeable_count > 0); vm_page_purgeable_count--; } @@ -1079,9 +1084,12 @@ vm_page_replace( object->resident_page_count++; - if (object->purgable == VM_PURGABLE_VOLATILE || - object->purgable == VM_PURGABLE_EMPTY) { + if (object->purgable == VM_PURGABLE_VOLATILE) { vm_page_purgeable_count++; + } else if (object->purgable == VM_PURGABLE_EMPTY) { + if (mem->throttled) { + vm_page_deactivate(mem); + } } } @@ -1151,8 +1159,7 @@ vm_page_remove( mem->object->resident_page_count--; - if (mem->object->purgable == VM_PURGABLE_VOLATILE || - mem->object->purgable == VM_PURGABLE_EMPTY) { + if (mem->object->purgable == VM_PURGABLE_VOLATILE) { assert(vm_page_purgeable_count > 0); vm_page_purgeable_count--; } @@ -2306,6 +2313,24 @@ 
vm_page_wire( mem->zero_fill = FALSE; OSAddAtomic(-1, (SInt32 *)&vm_zf_count); } +#if CONFIG_EMBEDDED + { + int percent_avail; + + /* + * Decide if we need to poke the memorystatus notification thread. + */ + percent_avail = + (vm_page_active_count + vm_page_inactive_count + + vm_page_speculative_count + vm_page_free_count + + (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 / + atop_64(max_mem); + if (percent_avail <= (kern_memorystatus_level - 5)) { + kern_memorystatus_level = percent_avail; + thread_wakeup((event_t)&kern_memorystatus_wakeup); + } + } +#endif /* * ENCRYPTED SWAP: * The page could be encrypted, but @@ -2374,20 +2399,29 @@ vm_page_unwire( assert(!mem->laundry); assert(mem->object != kernel_object); assert(mem->pageq.next == NULL && mem->pageq.prev == NULL); - if (!IP_VALID(memory_manager_default) && - mem->dirty && mem->object->internal && - (mem->object->purgable == VM_PURGABLE_DENY || - mem->object->purgable == VM_PURGABLE_NONVOLATILE || - mem->object->purgable == VM_PURGABLE_VOLATILE)) { - queue_enter(&vm_page_queue_throttled, mem, vm_page_t, pageq); - vm_page_throttled_count++; - mem->throttled = TRUE; + if (mem->object->purgable == VM_PURGABLE_EMPTY) { + vm_page_deactivate(mem); } else { - queue_enter(&vm_page_queue_active, mem, vm_page_t, pageq); - vm_page_active_count++; - mem->active = TRUE; + vm_page_activate(mem); } - mem->reference = TRUE; +#if CONFIG_EMBEDDED + { + int percent_avail; + + /* + * Decide if we need to poke the memorystatus notification thread. 
+ */ + percent_avail = + (vm_page_active_count + vm_page_inactive_count + + vm_page_speculative_count + vm_page_free_count + + (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 / + atop_64(max_mem); + if (percent_avail >= (kern_memorystatus_level + 5)) { + kern_memorystatus_level = percent_avail; + thread_wakeup((event_t)&kern_memorystatus_wakeup); + } + } +#endif } } @@ -2992,8 +3026,10 @@ vm_page_find_contiguous( unsigned int page_idx, start_idx; int free_considered, free_available; int substitute_needed; -#if MACH_ASSERT +#if DEBUG uint32_t tv_start_sec, tv_start_usec, tv_end_sec, tv_end_usec; +#endif +#if MACH_ASSERT int yielded = 0; int dumped_run = 0; int stolen_pages = 0; @@ -3004,7 +3040,8 @@ vm_page_find_contiguous( #if MACH_ASSERT vm_page_verify_free_lists(); - +#endif +#if DEBUG clock_get_system_microtime(&tv_start_sec, &tv_start_usec); #endif vm_page_lock_queues(); @@ -3373,7 +3410,7 @@ vm_page_find_contiguous( done_scanning: vm_page_unlock_queues(); -#if MACH_ASSERT +#if DEBUG clock_get_system_microtime(&tv_end_sec, &tv_end_usec); tv_end_sec -= tv_start_sec; @@ -3389,6 +3426,8 @@ vm_page_find_contiguous( printf("vm_find_page_contiguous(num=%d,low=%d): found %d pages in %d.%06ds... scanned %d pages... yielded %d times... dumped run %d times... 
stole %d pages\n", contig_pages, max_pnum, npages, tv_end_sec, tv_end_usec, page_idx, yielded, dumped_run, stolen_pages); +#endif +#if MACH_ASSERT vm_page_verify_free_lists(); #endif return m; diff --git a/osfmk/vm/vm_user.c b/osfmk/vm/vm_user.c index 0ed0a84b2..e551e62bd 100644 --- a/osfmk/vm/vm_user.c +++ b/osfmk/vm/vm_user.c @@ -2870,6 +2870,10 @@ kernel_upl_commit_range( if (flags & UPL_COMMIT_FREE_ON_EMPTY) flags |= UPL_COMMIT_NOTIFY_EMPTY; + if (flags & UPL_COMMIT_KERNEL_ONLY_FLAGS) { + return KERN_INVALID_ARGUMENT; + } + kr = upl_commit_range(upl, offset, size, flags, pl, count, &finished); if ((flags & UPL_COMMIT_NOTIFY_EMPTY) && finished) diff --git a/pexpert/gen/pe_gen.c b/pexpert/gen/pe_gen.c index 32be95e27..43ab61a96 100644 --- a/pexpert/gen/pe_gen.c +++ b/pexpert/gen/pe_gen.c @@ -39,11 +39,11 @@ int32_t gPESerialBaud = -1; void pe_init_debug(void) { - if (!PE_parse_boot_arg("debug", &DEBUGFlag)) + if (!PE_parse_boot_argn("debug", &DEBUGFlag, sizeof (DEBUGFlag))) DEBUGFlag = 0; } -void PE_enter_debugger(char *cause) +void PE_enter_debugger(const char *cause) { if (DEBUGFlag & DB_NMI) Debugger(cause); diff --git a/pexpert/i386/pe_interrupt.c b/pexpert/i386/pe_interrupt.c index 61b306419..316bf9780 100644 --- a/pexpert/i386/pe_interrupt.c +++ b/pexpert/i386/pe_interrupt.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include diff --git a/pexpert/i386/pe_kprintf.c b/pexpert/i386/pe_kprintf.c index 7db76a179..84855f63e 100644 --- a/pexpert/i386/pe_kprintf.c +++ b/pexpert/i386/pe_kprintf.c @@ -53,7 +53,7 @@ void PE_init_kprintf(boolean_t vm_initialized) if (!vm_initialized) { simple_lock_init(&kprintf_lock, 0); - if (PE_parse_boot_arg("debug", &boot_arg)) + if (PE_parse_boot_argn("debug", &boot_arg, sizeof (boot_arg))) if (boot_arg & DB_KPRT) disable_serial_output = FALSE; diff --git a/pexpert/i386/pe_misc.s b/pexpert/i386/pe_misc.s index 71bc46014..5bad35b4b 100644 --- a/pexpert/i386/pe_misc.s +++ b/pexpert/i386/pe_misc.s @@ -40,6 +40,7 
@@ ENTRY(PE_get_timebase) movl S_ARG0, %ecx rdtsc + lfence movl %edx, 0(%ecx) movl %eax, 4(%ecx) diff --git a/pexpert/i386/pe_serial.c b/pexpert/i386/pe_serial.c index 2080b6024..ddb48b162 100644 --- a/pexpert/i386/pe_serial.c +++ b/pexpert/i386/pe_serial.c @@ -52,7 +52,8 @@ enum { UART_LCR = 3, /* line control register */ UART_MCR = 4, /* modem control register */ UART_LSR = 5, /* line status register */ - UART_MSR = 6 /* modem status register */ + UART_MSR = 6, /* modem status register */ + UART_SCR = 7 /* scratch register */ }; enum { @@ -90,14 +91,12 @@ static int uart_initted = 0; /* 1 if init'ed */ static int uart_probe( void ) { - /* Verify that the Divisor Register is accessible */ - - WRITE( LCR, UART_LCR_DLAB ); - WRITE( DLL, 0x5a ); - if (READ(DLL) != 0x5a) return 0; - WRITE( DLL, 0xa5 ); - if (READ(DLL) != 0xa5) return 0; - WRITE( LCR, 0x00 ); + /* Verify that the Scratch Register is accessible */ + + WRITE( SCR, 0x5a ); + if (READ(SCR) != 0x5a) return 0; + WRITE( SCR, 0xa5 ); + if (READ(SCR) != 0xa5) return 0; return 1; } @@ -177,7 +176,7 @@ int serial_init( void ) /* Set baud rate - use the supplied boot-arg if available */ - if (PE_parse_boot_arg("serialbaud", &serial_baud_rate)) + if (PE_parse_boot_argn("serialbaud", &serial_baud_rate, sizeof (serial_baud_rate))) { /* Valid divisor? 
*/ if (!((UART_CLOCK / 16) % serial_baud_rate)) { diff --git a/pexpert/pexpert/pexpert.h b/pexpert/pexpert/pexpert.h index 5284266a8..d8a013397 100644 --- a/pexpert/pexpert/pexpert.h +++ b/pexpert/pexpert/pexpert.h @@ -51,13 +51,15 @@ typedef void *cpu_id_t; void PE_enter_debugger( - char *cause); + const char *cause); void PE_init_platform( boolean_t vm_initialized, void *args); + + void PE_init_kprintf( boolean_t vm_initialized); diff --git a/pexpert/ppc/pe_kprintf.c b/pexpert/ppc/pe_kprintf.c index 9dffd0de0..2509d6962 100644 --- a/pexpert/ppc/pe_kprintf.c +++ b/pexpert/ppc/pe_kprintf.c @@ -63,7 +63,7 @@ void PE_init_kprintf(__unused boolean_t vm_initialized) if (PE_state.initialized == FALSE) panic("Platform Expert not initialized"); - if (PE_parse_boot_arg("debug", &boot_arg)) + if (PE_parse_boot_argn("debug", &boot_arg, sizeof (boot_arg))) if(boot_arg & DB_KPRT) disable_serial_output = FALSE; if (DTLookupEntry(NULL, "/options", &options) == kSuccess) { @@ -92,7 +92,7 @@ void PE_init_kprintf(__unused boolean_t vm_initialized) } /* Check the boot-args for new serial baud. 
*/ - if (PE_parse_boot_arg("serialbaud", &serial_baud)) + if (PE_parse_boot_argn("serialbaud", &serial_baud, sizeof (serial_baud))) if (serial_baud != -1) gPESerialBaud = serial_baud; if( (scc = PE_find_scc())) { /* See if we can find the serial port */ diff --git a/security/mac_framework.h b/security/mac_framework.h index 4b9613d2f..58f3e2b33 100644 --- a/security/mac_framework.h +++ b/security/mac_framework.h @@ -318,6 +318,7 @@ int mac_proc_check_getaudit(proc_t proc); int mac_proc_check_getauid(proc_t proc); int mac_proc_check_getlcid(proc_t proc1, proc_t proc2, pid_t pid); +int mac_proc_check_map_prot_copy_allow(proc_t proc); int mac_proc_check_mprotect(proc_t proc, user_addr_t addr, user_size_t size, int prot); int mac_proc_check_sched(proc_t proc, proc_t proc2); @@ -439,6 +440,8 @@ int mac_vnode_check_exchangedata(vfs_context_t ctx, struct vnode *v1, struct vnode *v2); int mac_vnode_check_exec(vfs_context_t ctx, struct vnode *vp, struct image_params *imgp); +int mac_vnode_check_signature(struct vnode *vp, unsigned char *sha1, + void * signature, size_t size); int mac_vnode_check_getattrlist(vfs_context_t ctx, struct vnode *vp, struct attrlist *alist); int mac_vnode_check_getextattr(vfs_context_t ctx, struct vnode *vp, diff --git a/security/mac_policy.h b/security/mac_policy.h index a12cacb30..21b645a73 100644 --- a/security/mac_policy.h +++ b/security/mac_policy.h @@ -4407,6 +4407,25 @@ typedef int mpo_proc_check_get_task_t( kauth_cred_t cred, struct proc *p ); + + +/** + @brief Access control check for manipulating a proc's vm_map + @param cred Subject credential + @param proc Object process + + Determine whether the vm_map map belonging to process proc with + credential cred allows the VM_PROT_COPY operation. + + @return Return 0 if access is granted, otherwise an appropriate value for + errno should be returned. 
+ */ +typedef int mpo_proc_check_map_prot_copy_allow_t( + kauth_cred_t cred, + struct proc *p +); + + /** @brief Assign a label to a new kernelspace Mach task @param kproc New task @@ -4714,6 +4733,13 @@ typedef int mpo_vnode_check_exec_t( struct componentname *cnp, u_int *csflags ); +/** + @brief Access control check after determining the code directory hash + */ +typedef int mpo_vnode_check_signature_t(struct vnode *vp, struct label *label, + unsigned char *sha1, void *signature, + int size); + /** @brief Access control check for retrieving file attributes @param cred Subject credential @@ -6003,8 +6029,8 @@ struct mac_policy_ops { mpo_vnode_label_update_extattr_t *mpo_vnode_label_update_extattr; mpo_vnode_label_update_t *mpo_vnode_label_update; mpo_vnode_notify_create_t *mpo_vnode_notify_create; - mpo_reserved_hook_t *mpo_reserved0; - mpo_reserved_hook_t *mpo_reserved1; + mpo_vnode_check_signature_t *mpo_vnode_check_signature; + mpo_proc_check_map_prot_copy_allow_t *mpo_proc_check_map_prot_copy_allow; mpo_reserved_hook_t *mpo_reserved2; mpo_reserved_hook_t *mpo_reserved3; mpo_reserved_hook_t *mpo_reserved4; diff --git a/security/mac_process.c b/security/mac_process.c index 20ca2fb64..4ed4d53b7 100644 --- a/security/mac_process.c +++ b/security/mac_process.c @@ -366,6 +366,21 @@ mac_proc_check_mprotect(proc_t proc, return (error); } +int +mac_proc_check_map_prot_copy_allow(proc_t proc) +{ + kauth_cred_t cred; + int error; + + if (!mac_vm_enforce) return (0); + + cred = kauth_cred_proc_ref(proc); + MAC_CHECK(proc_check_map_prot_copy_allow, cred, proc); + kauth_cred_unref(&cred); + + return (error); +} + int mac_proc_check_sched(proc_t curp, struct proc *proc) { diff --git a/security/mac_vfs.c b/security/mac_vfs.c index 4316e2d3b..2bbfb04db 100644 --- a/security/mac_vfs.c +++ b/security/mac_vfs.c @@ -633,6 +633,19 @@ mac_vnode_check_exec(vfs_context_t ctx, struct vnode *vp, return (error); } +int +mac_vnode_check_signature(struct vnode *vp, unsigned char *sha1, + 
void * signature, size_t size) +{ + int error; + + if (!mac_vnode_enforce || !mac_proc_enforce) + return (0); + + MAC_CHECK(vnode_check_signature, vp, vp->v_label, sha1, signature, size); + return (error); +} + #if 0 int mac_vnode_check_getacl(vfs_context_t ctx, struct vnode *vp, acl_type_t type) diff --git a/tools/tests/affinity/sets.c b/tools/tests/affinity/sets.c index a7115e56b..50eda2626 100644 --- a/tools/tests/affinity/sets.c +++ b/tools/tests/affinity/sets.c @@ -282,13 +282,15 @@ manager_fn(void *arg) return (void *) iteration; } +#define MAX_CACHE_DEPTH 10 static void auto_config(int npages, int *nbufs, int *nsets) { int len; int ncpu; - int64_t cacheconfig[10]; - int64_t cachesize[10]; + int llc; + int64_t cacheconfig[MAX_CACHE_DEPTH]; + int64_t cachesize[MAX_CACHE_DEPTH]; mutter("Autoconfiguring...\n"); @@ -305,21 +307,28 @@ auto_config(int npages, int *nbufs, int *nsets) exit(1); } + /* + * Find LLC + */ + for (llc = MAX_CACHE_DEPTH - 1; llc > 0; llc--) + if (cacheconfig[llc] != 0) + break; + /* * Calculate number of buffers of size pages*4096 bytes * fit into 90% of an L2 cache. */ - *nbufs = cachesize[2] * 9 / (npages * 4096 * 10); - mutter(" L2 cache %qd bytes: " + *nbufs = cachesize[llc] * 9 / (npages * 4096 * 10); + mutter(" L%d (LLC) cache %qd bytes: " "using %d buffers of size %d bytes\n", - cachesize[2], *nbufs, (npages * 4096)); + llc, cachesize[llc], *nbufs, (npages * 4096)); /* * Calcalute how many sets: */ - *nsets = cacheconfig[0]/cacheconfig[2]; - mutter(" %qd cpus; %qd cpus per L2 cache: using %d sets\n", - cacheconfig[0], cacheconfig[2], *nsets); + *nsets = cacheconfig[0]/cacheconfig[llc]; + mutter(" %qd cpus; %qd cpus per L%d cache: using %d sets\n", + cacheconfig[0], cacheconfig[llc], llc, *nsets); } void (*producer_fnp)(int *data, int isize) = &writer_fn;