From 7d720fa4bf6aa1a646bd21de64761f4d9fd1c4af Mon Sep 17 00:00:00 2001 From: Darwin Date: Mon, 29 Mar 2010 19:14:35 +0000 Subject: [PATCH] xnu-1504.3.12 Imported from https://opensource.apple.com/tarballs/xnu/xnu-1504.3.12.tar.gz --- bsd/conf/MASTER | 10 + bsd/conf/MASTER.i386 | 6 +- bsd/conf/MASTER.ppc | 2 +- bsd/conf/MASTER.x86_64 | 6 +- bsd/conf/files | 5 +- bsd/dev/i386/sysctl.c | 6 + bsd/hfs/hfs.h | 2 +- bsd/hfs/hfs_cnode.c | 18 +- bsd/hfs/hfs_cnode.h | 10 + bsd/hfs/hfs_readwrite.c | 5 +- bsd/hfs/hfs_vfsops.c | 635 ++-- bsd/hfs/hfs_vfsutils.c | 4 +- bsd/hfs/hfs_vnops.c | 145 +- bsd/hfs/hfs_xattr.c | 12 +- bsd/hfs/hfscommon/BTree/BTreeAllocate.c | 12 + bsd/kern/imageboot.c | 46 +- bsd/kern/kdebug.c | 10 +- bsd/kern/kern_event.c | 55 +- bsd/kern/kern_memorystatus.c | 6 +- bsd/kern/kern_mman.c | 30 +- bsd/kern/kern_resource.c | 133 +- bsd/kern/kern_sysctl.c | 50 + bsd/kern/pthread_synch.c | 114 +- bsd/kern/sys_generic.c | 6 +- bsd/kern/syscalls.master | 2 +- bsd/kern/uipc_usrreq.c | 51 +- bsd/net/Makefile | 3 +- bsd/net/bridge.c | 906 ------ bsd/net/bridge.h | 211 -- bsd/net/bridgestp.c | 1153 +++++++ bsd/net/dlil.c | 65 +- bsd/net/ether_at_pr_module.c | 6 +- bsd/net/ether_if_module.c | 12 +- bsd/net/ether_inet6_pr_module.c | 7 +- bsd/net/ether_inet_pr_module.c | 6 +- bsd/net/ethernet.h | 7 +- bsd/net/if.h | 32 +- bsd/net/if_bridge.c | 3847 +++++++++++++++++++++++ bsd/net/if_bridgevar.h | 484 +++ bsd/net/if_ethersubr.c | 6 +- bsd/net/if_llc.h | 111 +- bsd/net/if_types.h | 3 +- bsd/net/if_var.h | 3 +- bsd/net/if_vlan.c | 4 +- bsd/net/pf.c | 288 +- bsd/net/pf_ioctl.c | 7 +- bsd/net/pfvar.h | 1 + bsd/net/route.c | 2 + bsd/netinet/in_arp.c | 86 +- bsd/netinet/ip_dummynet.c | 29 +- bsd/netinet/ip_output.c | 28 +- bsd/netinet/tcp_input.c | 5 +- bsd/netinet6/in6.c | 23 +- bsd/netinet6/in6_ifattach.c | 1 + bsd/netinet6/in6_proto.c | 14 + bsd/netinet6/ip6_input.c | 6 + bsd/netinet6/ip6_mroute.c | 4 + bsd/netinet6/ip6_mroute.h | 2 + bsd/netinet6/ip6_output.c | 2 + 
bsd/netinet6/ip6_var.h | 3 +- bsd/netinet6/ipsec.c | 15 +- bsd/netinet6/mld6.c | 4 + bsd/netinet6/nd6.c | 6 +- bsd/netinet6/raw_ip6.c | 6 + bsd/nfs/nfs_socket.c | 4 +- bsd/sys/buf_internal.h | 1 + bsd/sys/kern_memorystatus.h | 4 + bsd/sys/mount.h | 3 + bsd/sys/mount_internal.h | 4 + bsd/sys/pthread_internal.h | 2 +- bsd/sys/resource.h | 4 +- bsd/sys/socketvar.h | 4 +- bsd/sys/sockio.h | 18 +- bsd/sys/ubc_internal.h | 5 +- bsd/sys/vnode_internal.h | 4 + bsd/vfs/vfs_bio.c | 8 +- bsd/vfs/vfs_cluster.c | 18 +- bsd/vfs/vfs_conf.c | 5 + bsd/vfs/vfs_subr.c | 57 +- bsd/vfs/vfs_syscalls.c | 400 +++ config/Makefile | 13 + config/MasterVersion | 2 +- config/generate_linker_exports.sh | 15 + iokit/Kernel/IODMACommand.cpp | 3 +- iokit/Kernel/IOServicePM.cpp | 11 + kgmacros | 67 +- libkern/c++/OSKext.cpp | 89 +- libkern/conf/MASTER | 3 + libkern/conf/files | 1 + libkern/kxld/Makefile | 8 +- libkern/kxld/kxld_array.c | 2 +- libkern/kxld/kxld_demangle.c | 46 + libkern/kxld/kxld_demangle.h | 24 + libkern/kxld/kxld_kext.c | 56 +- libkern/kxld/kxld_util.h | 12 +- libkern/kxld/kxld_vtable.c | 54 +- libkern/libkern/OSAtomic.h | 2 +- libkern/libkern/c++/OSKext.h | 2 +- libkern/mkext.c | 22 +- libkern/zlib/adler32.c | 25 + libkern/zlib/arm/adler32vec.s | 428 +++ libkern/zlib/arm/inffastS.s | 571 ++++ libkern/zlib/inffast.c | 10 + makedefs/MakeInc.def | 5 +- makedefs/MakeInc.rule | 9 + osfmk/conf/MASTER | 4 + osfmk/console/panic_dialog.c | 36 +- osfmk/console/video_console.c | 33 + osfmk/i386/AT386/model_dep.c | 36 +- osfmk/i386/cpu_capabilities.h | 1 - osfmk/i386/cpuid.c | 3 +- osfmk/i386/cpuid.h | 7 +- osfmk/i386/lapic.c | 18 +- osfmk/i386/loose_ends.c | 15 +- osfmk/i386/pmCPU.c | 45 +- osfmk/i386/pmCPU.h | 5 +- osfmk/i386/pmap.c | 1259 +------- osfmk/i386/pmap.h | 5 +- osfmk/i386/pmap_internal.h | 690 +++- osfmk/i386/pmap_x86_common.c | 961 ++++++ osfmk/ipc/ipc_kmsg.c | 53 +- osfmk/ipc/ipc_kmsg.h | 13 +- osfmk/ipc/ipc_port.c | 19 +- osfmk/ipc/ipc_port.h | 4 +- osfmk/ipc/mach_port.c | 4 
+- osfmk/kdp/kdp.c | 104 +- osfmk/kdp/kdp_dyld.h | 84 + osfmk/kdp/kdp_udp.c | 15 +- osfmk/kern/debug.c | 7 +- osfmk/kern/debug.h | 19 +- osfmk/kern/processor.c | 11 + osfmk/kern/processor.h | 15 +- osfmk/kern/sched.h | 1 + osfmk/kern/sched_prim.c | 57 +- osfmk/kern/sched_prim.h | 6 +- osfmk/kern/task_policy.c | 10 + osfmk/kern/thread.c | 4 +- osfmk/kern/thread.h | 2 + osfmk/kern/thread_call.c | 8 +- osfmk/mach/task_policy.h | 4 +- osfmk/mach/vm_prot.h | 8 + osfmk/ppc/machine_routines.c | 8 +- osfmk/vm/vm_fault.c | 2 +- osfmk/vm/vm_map.c | 97 +- osfmk/vm/vm_map.h | 6 + osfmk/x86_64/loose_ends.c | 15 +- osfmk/x86_64/pmap.c | 1362 +------- pexpert/gen/bootargs.c | 6 + pexpert/i386/pe_init.c | 11 +- pexpert/pexpert/pexpert.h | 8 + 150 files changed, 11165 insertions(+), 4716 deletions(-) delete mode 100644 bsd/net/bridge.c delete mode 100644 bsd/net/bridge.h create mode 100644 bsd/net/bridgestp.c create mode 100644 bsd/net/if_bridge.c create mode 100644 bsd/net/if_bridgevar.h create mode 100755 config/generate_linker_exports.sh create mode 100644 libkern/kxld/kxld_demangle.c create mode 100644 libkern/kxld/kxld_demangle.h create mode 100644 libkern/zlib/arm/adler32vec.s create mode 100644 libkern/zlib/arm/inffastS.s create mode 100644 osfmk/kdp/kdp_dyld.h diff --git a/bsd/conf/MASTER b/bsd/conf/MASTER index ec9ff0940..36c667094 100644 --- a/bsd/conf/MASTER +++ b/bsd/conf/MASTER @@ -195,6 +195,7 @@ options QUOTA # file system quotas # options REV_ENDIAN_FS # Reverse Endian FS # options NAMEDSTREAMS # named stream vnop support # options CONFIG_VOLFS # volfs path support (legacy) # +options CONFIG_IMGSRC_ACCESS # source of imageboot dmg # # # NFS support @@ -245,6 +246,8 @@ options randomipid # options ZLIB # inflate/deflate support # +options IF_BRIDGE # + makeoptions LIBDRIVER = "libDriver_kern.o" # makeoptions LIBOBJC = "libkobjc.o" # @@ -306,6 +309,9 @@ options CONFIG_VFS_NAMES=4096 # options CONFIG_VFS_NAMES=3072 # options CONFIG_VFS_NAMES=2048 # +options 
CONFIG_MAX_CLUSTERS=8 # +options CONFIG_MAX_CLUSTERS=4 # + # # configurable kauth credential related resources # @@ -409,6 +415,10 @@ options CONFIG_EMBEDDED # # options CONFIG_ENFORCE_SIGNED_CODE # +# support dynamic signing of code +# +options CONFIG_DYNAMIC_CODE_SIGNING # + # # code decryption... used on embedded for app protection # must be set in all the bsd/conf and osfmk/conf MASTER files diff --git a/bsd/conf/MASTER.i386 b/bsd/conf/MASTER.i386 index 08eca2cbc..b953aaed9 100644 --- a/bsd/conf/MASTER.i386 +++ b/bsd/conf/MASTER.i386 @@ -45,8 +45,8 @@ # Standard Apple Research Configurations: # -------- ----- -------- --------------- # BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch ] -# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo union config_volfs hfs_compression ] -# NETWORKING = [ inet inet6 compat_oldsock mrouting tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo ipflow ] +# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo union config_volfs hfs_compression config_imgsrc_access ] +# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo ipflow ] # NFS = [ nfsclient nfsserver ] # VPN = [ ipsec ] # RELEASE = [ BASE NETWORKING NFS VPN FILESYS libdriver ] @@ -56,7 +56,7 @@ # # EMBEDDED_BASE = [ intel mach bsmall vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch ] # EMBEDDED_FILESYS = [ devfs hfs journaling fdesc 
fifo ] -# EMBEDDED_NET = [ inet compat_oldsock mrouting tcpdrop_synfin bpfilter config_mbuf_noexpand ] +# EMBEDDED_NET = [ inet compat_oldsock tcpdrop_synfin bpfilter config_mbuf_noexpand ] # EMBEDDED = [ EMBEDDED_BASE EMBEDDED_NET VPN EMBEDDED_FILESYS libdriver no_printf_str no_kprintf_str no_kdebug ] # DEVELOPMENT = [ EMBEDDED_BASE EMBEDDED_NET NFS VPN EMBEDDED_FILESYS libdriver netmibs development mach_assert config_dtrace ] # diff --git a/bsd/conf/MASTER.ppc b/bsd/conf/MASTER.ppc index 2a0846433..54ba3d565 100644 --- a/bsd/conf/MASTER.ppc +++ b/bsd/conf/MASTER.ppc @@ -47,7 +47,7 @@ # # BASE = [ ppc mach medium config_dtrace vol pst gdb noprofiling simple_clock kernstack sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue ] # FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo union config_volfs hfs_compression ] -# NETWORKING = [ inet inet6 compat_oldsock mrouting tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk ipflow ] +# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk ipflow ] # NFS = [ nfsclient nfsserver ] # VPN = [ ipsec ] # RELEASE = [ BASE NETWORKING NFS VPN FILESYS libdriver ] diff --git a/bsd/conf/MASTER.x86_64 b/bsd/conf/MASTER.x86_64 index dd1f24e96..3815e81f0 100644 --- a/bsd/conf/MASTER.x86_64 +++ b/bsd/conf/MASTER.x86_64 @@ -45,8 +45,8 @@ # Standard Apple Research Configurations: # -------- ----- -------- --------------- # BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch ] -# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo union config_volfs 
hfs_compression ] -# NETWORKING = [ inet inet6 compat_oldsock mrouting tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo ipflow ] +# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo union config_volfs hfs_compression config_imgsrc_access ] +# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo ipflow ] # NFS = [ nfsclient nfsserver ] # VPN = [ ipsec ] # RELEASE = [ BASE NETWORKING NFS VPN FILESYS libdriver ] @@ -56,7 +56,7 @@ # # EMBEDDED_BASE = [ intel mach bsmall vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch ] # EMBEDDED_FILESYS = [ devfs hfs journaling fdesc fifo ] -# EMBEDDED_NET = [ inet compat_oldsock mrouting tcpdrop_synfin bpfilter config_mbuf_noexpand ] +# EMBEDDED_NET = [ inet compat_oldsock tcpdrop_synfin bpfilter config_mbuf_noexpand ] # EMBEDDED = [ EMBEDDED_BASE EMBEDDED_NET VPN EMBEDDED_FILESYS libdriver no_printf_str no_kprintf_str no_kdebug ] # DEVELOPMENT = [ EMBEDDED_BASE EMBEDDED_NET NFS VPN EMBEDDED_FILESYS libdriver netmibs development mach_assert ] # diff --git a/bsd/conf/files b/bsd/conf/files index 95f856c21..fce436ec6 100644 --- a/bsd/conf/files +++ b/bsd/conf/files @@ -107,7 +107,7 @@ OPTIONS/ipfw2 optional ipfw2 OPTIONS/ipfirewall optional ipfirewall OPTIONS/ipv6firewall optional ipv6firewall OPTIONS/tcpdebug optional tcpdebug -OPTIONS/bridge optional bridge +OPTIONS/if_bridge optional if_bridge OPTIONS/faith optional faith OPTIONS/gif optional gif OPTIONS/netat optional netat @@ -200,7 +200,8 @@ bsd/kern/decmpfs.c standard bsd/net/bpf.c optional bpfilter bsd/net/bpf_filter.c optional bpfilter 
-bsd/net/bridge.c optional bridge +bsd/net/if_bridge.c optional if_bridge +bsd/net/bridgestp.c optional if_bridge bsd/net/bsd_comp.c optional ppp_bsdcomp bsd/net/if.c optional networking bsd/net/if_atmsubr.c optional atm diff --git a/bsd/dev/i386/sysctl.c b/bsd/dev/i386/sysctl.c index 2b9609d53..597a208c1 100644 --- a/bsd/dev/i386/sysctl.c +++ b/bsd/dev/i386/sysctl.c @@ -314,6 +314,12 @@ SYSCTL_PROC(_machdep_cpu_thermal, OID_AUTO, dynamic_acceleration, sizeof(boolean_t), cpu_thermal, "I", "Dynamic Acceleration Technology (Turbo Mode)"); +SYSCTL_PROC(_machdep_cpu_thermal, OID_AUTO, invariant_APIC_timer, + CTLTYPE_INT | CTLFLAG_RD, + (void *)offsetof(cpuid_thermal_leaf_t, invariant_APIC_timer), + sizeof(boolean_t), + cpu_thermal, "I", "Invariant APIC Timer"); + SYSCTL_PROC(_machdep_cpu_thermal, OID_AUTO, thresholds, CTLTYPE_INT | CTLFLAG_RD, (void *)offsetof(cpuid_thermal_leaf_t, thresholds), diff --git a/bsd/hfs/hfs.h b/bsd/hfs/hfs.h index 67ecb6d1e..beb10099f 100644 --- a/bsd/hfs/hfs.h +++ b/bsd/hfs/hfs.h @@ -755,7 +755,7 @@ extern int hfs_btsync(struct vnode *vp, int sync_transaction); extern void replace_desc(struct cnode *cp, struct cat_desc *cdp); extern int hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, - struct vnode **rvpp, int can_drop_lock); + struct vnode **rvpp, int can_drop_lock, int error_on_unlinked); extern int hfs_update(struct vnode *, int); diff --git a/bsd/hfs/hfs_cnode.c b/bsd/hfs/hfs_cnode.c index 7123f603f..c17c8d4dd 100644 --- a/bsd/hfs/hfs_cnode.c +++ b/bsd/hfs/hfs_cnode.c @@ -59,6 +59,10 @@ static void hfs_reclaim_cnode(struct cnode *); static int hfs_isordered(struct cnode *, struct cnode *); +inline int hfs_checkdeleted (struct cnode *cp) { + return ((cp->c_flag & (C_DELETED | C_NOEXISTS)) ? ENOENT : 0); +} + /* * Last reference to an cnode. If necessary, write or delete it. 
@@ -195,7 +199,7 @@ hfs_vnop_inactive(struct vnop_inactive_args *ap) if ((cp->c_blocks > 0) && (forkcount == 1) && (vp != cp->c_rsrc_vp)) { struct vnode *rvp = NULLVP; - error = hfs_vgetrsrc(hfsmp, vp, &rvp, FALSE); + error = hfs_vgetrsrc(hfsmp, vp, &rvp, FALSE, FALSE); if (error) goto out; /* @@ -612,9 +616,15 @@ hfs_getnewvnode( return (ENOENT); } - /* Hardlinks may need an updated catalog descriptor */ - if ((cp->c_flag & C_HARDLINK) && descp->cd_nameptr && descp->cd_namelen > 0) { - replace_desc(cp, descp); + /* + * Hardlinks may need an updated catalog descriptor. However, if + * the cnode has already been marked as open-unlinked (C_DELETED), then don't + * replace its descriptor. + */ + if (!(hfs_checkdeleted(cp))) { + if ((cp->c_flag & C_HARDLINK) && descp->cd_nameptr && descp->cd_namelen > 0) { + replace_desc(cp, descp); + } } /* Check if we found a matching vnode */ if (*vpp != NULL) diff --git a/bsd/hfs/hfs_cnode.h b/bsd/hfs/hfs_cnode.h index 27c1b9a55..9ffb9a8ca 100644 --- a/bsd/hfs/hfs_cnode.h +++ b/bsd/hfs/hfs_cnode.h @@ -227,6 +227,16 @@ enum { kFinderInvisibleMask = 1 << 14 }; FTOC(fp)->c_rsrc_vp : \ FTOC(fp)->c_vp) +/* + * This is a helper function used for determining whether or not a cnode has become open + * unlinked in between the time we acquired its vnode and the time we acquire the cnode lock + * to start manipulating it. Due to the SMP nature of VFS, it is probably necessary to + * call this function every time we acquire a cnode lock, as the content of the Cnode may have + * been modified in between the lookup and a VNOP. Whether or not to call this is dependent + * upon the VNOP in question. Sometimes it is OK to use an open-unlinked file, for example, in + * reading. But other times, such as on the source of a VNOP_RENAME, it should be disallowed.
+ */ +int hfs_checkdeleted (struct cnode *cp); /* * Test for a resource fork diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c index 6dc30afad..97578830d 100644 --- a/bsd/hfs/hfs_readwrite.c +++ b/bsd/hfs/hfs_readwrite.c @@ -3296,6 +3296,7 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) vm_offset_t a_pl_offset; int a_flags; int is_pageoutv2 = 0; + kern_return_t kret; cp = VTOC(vp); fp = VTOF(vp); @@ -3339,9 +3340,9 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) else { request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY; } - ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags); + kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags); - if (upl == (upl_t) NULL) { + if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) { retval = EINVAL; goto pageout_done; } diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c index 8148697b2..de087422b 100644 --- a/bsd/hfs/hfs_vfsops.c +++ b/bsd/hfs/hfs_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2009 Apple Inc. All rights reserved. + * Copyright (c) 1999-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -114,6 +114,8 @@ int hfs_dbg_all = 0; int hfs_dbg_err = 0; #endif +/* Enable/disable debugging code for live volume resizing */ +int hfs_resize_debug = 0; lck_grp_attr_t * hfs_group_attr; lck_attr_t * hfs_lock_attr; @@ -146,8 +148,7 @@ static int hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context); static int hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, vfs_context_t context); static int hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimblks, vfs_context_t context); -static int hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, - u_int32_t catblks, u_int32_t fileID, int rsrcfork); +static int hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t fileID); static int hfs_journal_replay(vnode_t devvp, vfs_context_t context); @@ -3803,17 +3804,18 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) u_int32_t reclaimblks = 0; int lockflags = 0; int transaction_begun = 0; + Boolean updateFreeBlocks = false; int error; - lck_mtx_lock(&hfsmp->hfs_mutex); + HFS_MOUNT_LOCK(hfsmp, TRUE); if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) { - lck_mtx_unlock(&hfsmp->hfs_mutex); + HFS_MOUNT_UNLOCK(hfsmp, TRUE); return (EALREADY); } hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS; hfsmp->hfs_resize_filesmoved = 0; hfsmp->hfs_resize_totalfiles = 0; - lck_mtx_unlock(&hfsmp->hfs_mutex); + HFS_MOUNT_UNLOCK(hfsmp, TRUE); /* * - Journaled HFS Plus volumes only. 
@@ -3828,18 +3830,23 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) newblkcnt = newsize / hfsmp->blockSize; reclaimblks = hfsmp->totalBlocks - newblkcnt; + if (hfs_resize_debug) { + printf ("hfs_truncatefs: old: size=%qu, blkcnt=%u, freeblks=%u\n", oldsize, hfsmp->totalBlocks, hfs_freeblks(hfsmp, 1)); + printf ("hfs_truncatefs: new: size=%qu, blkcnt=%u, reclaimblks=%u\n", newsize, newblkcnt, reclaimblks); + } + /* Make sure new size is valid. */ if ((newsize < HFS_MIN_SIZE) || (newsize >= oldsize) || (newsize % hfsmp->hfs_logical_block_size) || (newsize % hfsmp->hfs_physical_block_size)) { - printf ("hfs_truncatefs: invalid size\n"); + printf ("hfs_truncatefs: invalid size (newsize=%qu, oldsize=%qu)\n", newsize, oldsize); error = EINVAL; goto out; } - /* Make sure there's enough space to work with. */ + /* Make sure that the file system has enough free blocks to reclaim */ if (reclaimblks >= hfs_freeblks(hfsmp, 1)) { - printf("hfs_truncatefs: insufficient space (need %u blocks; have %u blocks)\n", reclaimblks, hfs_freeblks(hfsmp, 1)); + printf("hfs_truncatefs: insufficient space (need %u blocks; have %u free blocks)\n", reclaimblks, hfs_freeblks(hfsmp, 1)); error = ENOSPC; goto out; } @@ -3862,17 +3869,21 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) * in the allocation blocks beyond (i.e. the blocks we're trying to * truncate away. */ - lck_mtx_lock(&hfsmp->hfs_mutex); + HFS_MOUNT_LOCK(hfsmp, TRUE); if (hfsmp->blockSize == 512) hfsmp->allocLimit = newblkcnt - 2; else hfsmp->allocLimit = newblkcnt - 1; + /* Update the volume free block count to reflect the total number of + * free blocks that will exist after a successful resize. + */ hfsmp->freeBlocks -= reclaimblks; - lck_mtx_unlock(&hfsmp->hfs_mutex); - + updateFreeBlocks = true; + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + /* * Look for files that have blocks at or beyond the location of the - * new alternate volume header.
+ * new alternate volume header */ if (hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks)) { /* @@ -3883,8 +3894,9 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) transaction_begun = 0; /* Attempt to reclaim some space. */ - if (hfs_reclaimspace(hfsmp, hfsmp->allocLimit, reclaimblks, context) != 0) { - printf("hfs_truncatefs: couldn't reclaim space on %s\n", hfsmp->vcbVN); + error = hfs_reclaimspace(hfsmp, hfsmp->allocLimit, reclaimblks, context); + if (error != 0) { + printf("hfs_truncatefs: couldn't reclaim space on %s (error=%d)\n", hfsmp->vcbVN, error); error = ENOSPC; goto out; } @@ -3895,8 +3907,9 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) transaction_begun = 1; /* Check if we're clear now. */ - if (hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks)) { - printf("hfs_truncatefs: didn't reclaim enough space on %s\n", hfsmp->vcbVN); + error = hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks); + if (error != 0) { + printf("hfs_truncatefs: didn't reclaim enough space on %s (error=%d)\n", hfsmp->vcbVN, error); error = EAGAIN; /* tell client to try again */ goto out; } @@ -3933,14 +3946,16 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) * since this block will be outside of the truncated file system! 
*/ if (hfsmp->hfs_alt_id_sector) { - if (buf_meta_bread(hfsmp->hfs_devvp, + error = buf_meta_bread(hfsmp->hfs_devvp, HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys), - hfsmp->hfs_physical_block_size, NOCRED, &bp) == 0) { - + hfsmp->hfs_physical_block_size, NOCRED, &bp); + if (error == 0) { bzero((void*)((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size)), kMDBSize); (void) VNOP_BWRITE(bp); - } else if (bp) { - buf_brelse(bp); + } else { + if (bp) { + buf_brelse(bp); + } } bp = NULL; } @@ -3963,7 +3978,7 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) /* * TODO: Adjust the size of the metadata zone based on new volume size? */ - + /* * Adjust the size of hfsmp->hfs_attrdata_vp */ @@ -3985,15 +4000,14 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) } out: - if (error) - hfsmp->freeBlocks += reclaimblks; - lck_mtx_lock(&hfsmp->hfs_mutex); + if (error && (updateFreeBlocks == true)) + hfsmp->freeBlocks += reclaimblks; hfsmp->allocLimit = hfsmp->totalBlocks; if (hfsmp->nextAllocation >= hfsmp->allocLimit) hfsmp->nextAllocation = hfsmp->hfs_metazone_end + 1; hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS; - lck_mtx_unlock(&hfsmp->hfs_mutex); + HFS_MOUNT_UNLOCK(hfsmp, TRUE); if (lockflags) { hfs_systemfile_unlock(hfsmp, lockflags); @@ -4001,6 +4015,8 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) if (transaction_begun) { hfs_end_transaction(hfsmp); hfs_journal_flush(hfsmp); + /* Just to be sure, sync all data to the disk */ + (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context); } return (error); @@ -4077,18 +4093,6 @@ hfs_copy_extent( if (cp != hfsmp->hfs_allocation_cp && cp->c_lockowner != current_thread()) panic("hfs_copy_extent: vp=%p (cp=%p) not owned?\n", vp, cp); - /* - * Wait for any in-progress writes to this vnode to complete, so that we'll - * be copying consistent bits. 
(Otherwise, it's possible that an async - * write will complete to the old extent after we read from it. That - * could lead to corruption.) - */ - err = vnode_waitforwrites(vp, 0, 0, 0, "hfs_copy_extent"); - if (err) { - printf("hfs_copy_extent: Error %d from vnode_waitforwrites\n", err); - return err; - } - /* * Determine the I/O size to use * @@ -4134,7 +4138,7 @@ hfs_copy_extent( buf_setcount(bp, ioSize); buf_setblkno(bp, destSector); buf_setlblkno(bp, destSector); - if (journal_uses_fua(hfsmp->jnl)) + if (vnode_issystem(vp) && journal_uses_fua(hfsmp->jnl)) buf_markfua(bp); /* Do the write */ @@ -4157,7 +4161,7 @@ hfs_copy_extent( kmem_free(kernel_map, (vm_offset_t)buffer, bufferSize); /* Make sure all writes have been flushed to disk. */ - if (!journal_uses_fua(hfsmp->jnl)) { + if (vnode_issystem(vp) && !journal_uses_fua(hfsmp->jnl)) { err = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context); if (err) { printf("hfs_copy_extent: DKIOCSYNCHRONIZECACHE failed (%d)\n", err); @@ -4172,8 +4176,15 @@ hfs_copy_extent( } +static int +hfs_relocate_callback(__unused HFSPlusExtentKey *key, HFSPlusExtentRecord *record, HFSPlusExtentRecord *state) +{ + bcopy(state, record, sizeof(HFSPlusExtentRecord)); + return 0; +} + /* - * Reclaim space at the end of a volume, used by a given system file. + * Reclaim space at the end of a volume, used by a given file. * * This routine attempts to move any extent which contains allocation blocks * at or after "startblk." A separate transaction is used to do the move. @@ -4182,109 +4193,191 @@ hfs_copy_extent( * of a transaction have their physical block numbers invalidated so they will * eventually be written to their new locations. * - * This routine can be used to move overflow extents for the allocation file. - * * Inputs: * hfsmp The volume being resized. * startblk Blocks >= this allocation block need to be moved. * locks Which locks need to be taken for the given system file. 
* vp The vnode for the system file. * + * The caller of this function, hfs_reclaimspace(), grabs cnode lock + * for non-system files before calling this function. + * * Outputs: - * moved Set to true if any extents were moved. + * blks_moved Total number of allocation blocks moved by this routine. */ static int -hfs_relocate_callback(__unused HFSPlusExtentKey *key, HFSPlusExtentRecord *record, HFSPlusExtentRecord *state) -{ - bcopy(state, record, sizeof(HFSPlusExtentRecord)); - return 0; -} -static int -hfs_reclaim_sys_file(struct hfsmount *hfsmp, struct vnode *vp, u_long startblk, int locks, Boolean *moved, vfs_context_t context) +hfs_reclaim_file(struct hfsmount *hfsmp, struct vnode *vp, u_long startblk, int locks, u_int32_t *blks_moved, vfs_context_t context) { int error; int lockflags; int i; u_long datablks; - u_long block; + u_long end_block; u_int32_t oldStartBlock; u_int32_t newStartBlock; - u_int32_t blockCount; + u_int32_t oldBlockCount; + u_int32_t newBlockCount; struct filefork *fp; - + struct cnode *cp; + int is_sysfile; + int took_truncate_lock = 0; + struct BTreeIterator *iterator = NULL; + u_int8_t forktype; + u_int32_t fileID; + /* If there is no vnode for this file, then there's nothing to do. */ if (vp == NULL) return 0; - /* printf("hfs_reclaim_sys_file: %.*s\n", VTOC(vp)->c_desc.cd_namelen, VTOC(vp)->c_desc.cd_nameptr); */ + cp = VTOC(vp); + fileID = cp->c_cnid; + is_sysfile = vnode_issystem(vp); + forktype = VNODE_IS_RSRC(vp) ? 0xFF : 0; + + /* Flush all the buffer cache blocks and cluster pages associated with + * this vnode. + * + * If the current vnode is a system vnode, all the buffer cache blocks + * associated with it should already be sync'ed to the disk as part of + * journal flush in hfs_truncatefs(). Normally there should not be + * buffer cache blocks for regular files, but for objects like symlinks, + * we can have buffer cache blocks associated with the vnode. Therefore + * we call buf_flushdirtyblks() always. 
Resource fork data for directory + * hard links are directly written using buffer cache for device vnode, + * which should also be sync'ed as part of journal flush in hfs_truncatefs(). + * + * Flushing cluster pages should be the normal case for regular files, + * and really should not do anything for system files. But just to be + * sure that all blocks associated with this vnode is sync'ed to the + * disk, we call both buffer cache and cluster layer functions. + */ + buf_flushdirtyblks(vp, MNT_NOWAIT, 0, "hfs_reclaim_file"); + if (!is_sysfile) { + /* The caller grabs cnode lock for non-system files only, therefore + * we unlock only non-system files before calling cluster layer. + */ + hfs_unlock(cp); + hfs_lock_truncate(cp, TRUE); + took_truncate_lock = 1; + } + (void) cluster_push(vp, 0); + if (!is_sysfile) { + error = hfs_lock(cp, HFS_FORCE_LOCK); + if (error) { + hfs_unlock_truncate(cp, TRUE); + return error; + } + + /* If the file no longer exists, nothing left to do */ + if (cp->c_flag & C_NOEXISTS) { + hfs_unlock_truncate(cp, TRUE); + return 0; + } + } + + /* Wait for any in-progress writes to this vnode to complete, so that we'll + * be copying consistent bits. (Otherwise, it's possible that an async + * write will complete to the old extent after we read from it. That + * could lead to corruption.) + */ + error = vnode_waitforwrites(vp, 0, 0, 0, "hfs_reclaim_file"); + if (error) { + printf("hfs_reclaim_file: Error %d from vnode_waitforwrites\n", error); + return error; + } + + if (hfs_resize_debug) { + printf("hfs_reclaim_file: Start relocating %sfork for fileid=%u name=%.*s\n", (forktype ? 
"rsrc" : "data"), fileID, cp->c_desc.cd_namelen, cp->c_desc.cd_nameptr); + } + /* We always need the allocation bitmap and extents B-tree */ locks |= SFL_BITMAP | SFL_EXTENTS; error = hfs_start_transaction(hfsmp); if (error) { - printf("hfs_reclaim_sys_file: hfs_start_transaction returned %d\n", error); + printf("hfs_reclaim_file: hfs_start_transaction returned %d\n", error); + if (took_truncate_lock) { + hfs_unlock_truncate(cp, TRUE); + } return error; } lockflags = hfs_systemfile_lock(hfsmp, locks, HFS_EXCLUSIVE_LOCK); fp = VTOF(vp); datablks = 0; + *blks_moved = 0; /* Relocate non-overflow extents */ for (i = 0; i < kHFSPlusExtentDensity; ++i) { if (fp->ff_extents[i].blockCount == 0) break; oldStartBlock = fp->ff_extents[i].startBlock; - blockCount = fp->ff_extents[i].blockCount; - datablks += blockCount; - block = oldStartBlock + blockCount; - if (block > startblk) { - error = BlockAllocate(hfsmp, 1, blockCount, blockCount, true, true, &newStartBlock, &blockCount); + oldBlockCount = fp->ff_extents[i].blockCount; + datablks += oldBlockCount; + end_block = oldStartBlock + oldBlockCount; + /* Check if the file overlaps the target space */ + if (end_block > startblk) { + /* Allocate a new extent */ + error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, true, (is_sysfile ? 
true : false), &newStartBlock, &newBlockCount); if (error) { - printf("hfs_reclaim_sys_file: BlockAllocate returned %d\n", error); + printf("hfs_reclaim_file: BlockAllocate (error=%d) for fileID=%u %u:(%u,%u)\n", error, fileID, i, oldStartBlock, oldBlockCount); goto fail; } - if (blockCount != fp->ff_extents[i].blockCount) { - printf("hfs_reclaim_sys_file: new blockCount=%u, original blockCount=%u", blockCount, fp->ff_extents[i].blockCount); - goto free_fail; + if (newBlockCount != oldBlockCount) { + printf("hfs_reclaim_file: fileID=%u - newBlockCount=%u, oldBlockCount=%u", fileID, newBlockCount, oldBlockCount); + if (BlockDeallocate(hfsmp, newStartBlock, newBlockCount)) { + hfs_mark_volume_inconsistent(hfsmp); + } + goto fail; } - error = hfs_copy_extent(hfsmp, vp, oldStartBlock, newStartBlock, blockCount, context); + + /* Copy data from old location to new location */ + error = hfs_copy_extent(hfsmp, vp, oldStartBlock, newStartBlock, newBlockCount, context); if (error) { - printf("hfs_reclaim_sys_file: hfs_copy_extent returned %d\n", error); - goto free_fail; + printf("hfs_reclaim_file: hfs_copy_extent error=%d for fileID=%u %u:(%u,%u) to %u:(%u,%u)\n", error, fileID, i, oldStartBlock, oldBlockCount, i, newStartBlock, newBlockCount); + if (BlockDeallocate(hfsmp, newStartBlock, newBlockCount)) { + hfs_mark_volume_inconsistent(hfsmp); + } + goto fail; } fp->ff_extents[i].startBlock = newStartBlock; - VTOC(vp)->c_flag |= C_MODIFIED; - *moved = true; - error = BlockDeallocate(hfsmp, oldStartBlock, blockCount); + cp->c_flag |= C_MODIFIED; + *blks_moved += newBlockCount; + + /* Deallocate the old extent */ + error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount); if (error) { - /* TODO: Mark volume inconsistent? 
*/ - printf("hfs_reclaim_sys_file: BlockDeallocate returned %d\n", error); + printf("hfs_reclaim_file: BlockDeallocate returned %d\n", error); + hfs_mark_volume_inconsistent(hfsmp); goto fail; } - error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); - if (error) { - /* TODO: Mark volume inconsistent? */ - printf("hfs_reclaim_sys_file: hfs_flushvolumeheader returned %d\n", error); - goto fail; + + /* If this is a system file, sync the volume header on disk */ + if (is_sysfile) { + error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); + if (error) { + printf("hfs_reclaim_file: hfs_flushvolumeheader returned %d\n", error); + hfs_mark_volume_inconsistent(hfsmp); + goto fail; + } + } + + if (hfs_resize_debug) { + printf ("hfs_reclaim_file: Relocated %u:(%u,%u) to %u:(%u,%u)\n", i, oldStartBlock, oldBlockCount, i, newStartBlock, newBlockCount); } } } /* Relocate overflow extents (if any) */ if (i == kHFSPlusExtentDensity && fp->ff_blocks > datablks) { - struct BTreeIterator *iterator = NULL; struct FSBufferDescriptor btdata; HFSPlusExtentRecord record; HFSPlusExtentKey *key; FCB *fcb; - u_int32_t fileID; - u_int8_t forktype; + int overflow_count = 0; - forktype = VNODE_IS_RSRC(vp) ? 0xFF : 0; - fileID = VTOC(vp)->c_cnid; if (kmem_alloc(kernel_map, (vm_offset_t*) &iterator, sizeof(*iterator))) { - printf("hfs_reclaim_sys_file: kmem_alloc failed!\n"); + printf("hfs_reclaim_file: kmem_alloc failed!\n"); error = ENOMEM; goto fail; } @@ -4305,40 +4398,51 @@ hfs_reclaim_sys_file(struct hfsmount *hfsmp, struct vnode *vp, u_long startblk, error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator); while (error == 0) { /* Stop when we encounter a different file or fork. */ - if ((key->fileID != fileID) || - (key->forkType != forktype)) { + if ((key->fileID != fileID) || + (key->forkType != forktype)) { break; } + + /* Just track the overflow extent record number for debugging... 
*/ + if (hfs_resize_debug) { + overflow_count++; + } + /* * Check if the file overlaps target space. */ for (i = 0; i < kHFSPlusExtentDensity; ++i) { if (record[i].blockCount == 0) { - goto overflow_done; + goto fail; } oldStartBlock = record[i].startBlock; - blockCount = record[i].blockCount; - block = oldStartBlock + blockCount; - if (block > startblk) { - error = BlockAllocate(hfsmp, 1, blockCount, blockCount, true, true, &newStartBlock, &blockCount); + oldBlockCount = record[i].blockCount; + end_block = oldStartBlock + oldBlockCount; + if (end_block > startblk) { + error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, true, (is_sysfile ? true : false), &newStartBlock, &newBlockCount); if (error) { - printf("hfs_reclaim_sys_file: BlockAllocate returned %d\n", error); - goto overflow_done; + printf("hfs_reclaim_file: BlockAllocate (error=%d) for fileID=%u %u:(%u,%u)\n", error, fileID, i, oldStartBlock, oldBlockCount); + goto fail; } - if (blockCount != record[i].blockCount) { - printf("hfs_reclaim_sys_file: new blockCount=%u, original blockCount=%u", blockCount, fp->ff_extents[i].blockCount); - kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); - goto free_fail; + if (newBlockCount != oldBlockCount) { + printf("hfs_reclaim_file: fileID=%u - newBlockCount=%u, oldBlockCount=%u", fileID, newBlockCount, oldBlockCount); + if (BlockDeallocate(hfsmp, newStartBlock, newBlockCount)) { + hfs_mark_volume_inconsistent(hfsmp); + } + goto fail; } - error = hfs_copy_extent(hfsmp, vp, oldStartBlock, newStartBlock, blockCount, context); + error = hfs_copy_extent(hfsmp, vp, oldStartBlock, newStartBlock, newBlockCount, context); if (error) { - printf("hfs_reclaim_sys_file: hfs_copy_extent returned %d\n", error); - kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); - goto free_fail; + printf("hfs_reclaim_file: hfs_copy_extent error=%d for fileID=%u (%u,%u) to (%u,%u)\n", error, fileID, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount); + 
if (BlockDeallocate(hfsmp, newStartBlock, newBlockCount)) { + hfs_mark_volume_inconsistent(hfsmp); + } + goto fail; } record[i].startBlock = newStartBlock; - VTOC(vp)->c_flag |= C_MODIFIED; - *moved = true; + cp->c_flag |= C_MODIFIED; + *blks_moved += newBlockCount; + /* * NOTE: To support relocating overflow extents of the * allocation file, we must update the BTree record BEFORE @@ -4349,15 +4453,18 @@ hfs_reclaim_sys_file(struct hfsmount *hfsmp, struct vnode *vp, u_long startblk, */ error = BTUpdateRecord(fcb, iterator, (IterateCallBackProcPtr) hfs_relocate_callback, &record); if (error) { - /* TODO: Mark volume inconsistent? */ - printf("hfs_reclaim_sys_file: BTUpdateRecord returned %d\n", error); - goto overflow_done; + printf("hfs_reclaim_file: BTUpdateRecord returned %d\n", error); + hfs_mark_volume_inconsistent(hfsmp); + goto fail; } - error = BlockDeallocate(hfsmp, oldStartBlock, blockCount); + error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount); if (error) { - /* TODO: Mark volume inconsistent? 
*/ - printf("hfs_reclaim_sys_file: BlockDeallocate returned %d\n", error); - goto overflow_done; + printf("hfs_reclaim_file: BlockDeallocate returned %d\n", error); + hfs_mark_volume_inconsistent(hfsmp); + goto fail; + } + if (hfs_resize_debug) { + printf ("hfs_reclaim_file: Relocated overflow#%d %u:(%u,%u) to %u:(%u,%u)\n", overflow_count, i, oldStartBlock, oldBlockCount, i, newStartBlock, newBlockCount); } } } @@ -4368,26 +4475,29 @@ hfs_reclaim_sys_file(struct hfsmount *hfsmp, struct vnode *vp, u_long startblk, break; } } -overflow_done: - kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); - if (error) { - goto fail; - } } - hfs_systemfile_unlock(hfsmp, lockflags); - error = hfs_end_transaction(hfsmp); - if (error) { - printf("hfs_reclaim_sys_file: hfs_end_transaction returned %d\n", error); +fail: + if (iterator) { + kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); } - return error; - -free_fail: - (void) BlockDeallocate(hfsmp, newStartBlock, blockCount); -fail: (void) hfs_systemfile_unlock(hfsmp, lockflags); + + if ((*blks_moved != 0) && (is_sysfile == false)) { + (void) hfs_update(vp, MNT_WAIT); + } + (void) hfs_end_transaction(hfsmp); + + if (took_truncate_lock) { + hfs_unlock_truncate(cp, TRUE); + } + + if (hfs_resize_debug) { + printf("hfs_reclaim_file: Finished relocating %sfork for fileid=%u (error=%d)\n", (forktype ? 
"rsrc" : "data"), fileID, error); + } + return error; } @@ -4453,6 +4563,7 @@ hfs_reclaim_journal_file(struct hfsmount *hfsmp, vfs_context_t context) { int error; int lockflags; + u_int32_t oldStartBlock; u_int32_t newStartBlock; u_int32_t oldBlockCount; u_int32_t newBlockCount; @@ -4493,6 +4604,7 @@ hfs_reclaim_journal_file(struct hfsmount *hfsmp, vfs_context_t context) printf("hfs_reclaim_journal_file: cat_idlookup returned %d\n", error); goto free_fail; } + oldStartBlock = journal_fork.cf_extents[0].startBlock; journal_fork.cf_size = newBlockCount * hfsmp->blockSize; journal_fork.cf_extents[0].startBlock = newStartBlock; journal_fork.cf_extents[0].blockCount = newBlockCount; @@ -4524,6 +4636,9 @@ hfs_reclaim_journal_file(struct hfsmount *hfsmp, vfs_context_t context) printf("hfs_reclaim_journal_file: hfs_end_transaction returned %d\n", error); } + if (!error && hfs_resize_debug) { + printf ("hfs_reclaim_journal_file: Successfully relocated journal from (%u,%u) to (%u,%u)\n", oldStartBlock, oldBlockCount, newStartBlock, newBlockCount); + } return error; free_fail: @@ -4531,6 +4646,9 @@ hfs_reclaim_journal_file(struct hfsmount *hfsmp, vfs_context_t context) fail: hfs_systemfile_unlock(hfsmp, lockflags); (void) hfs_end_transaction(hfsmp); + if (hfs_resize_debug) { + printf ("hfs_reclaim_journal_file: Error relocating journal file (error=%d)\n", error); + } return error; } @@ -4545,6 +4663,7 @@ hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, vfs_context_t context) { int error; int lockflags; + u_int32_t oldBlock; u_int32_t newBlock; u_int32_t blockCount; struct cat_desc jib_desc; @@ -4608,6 +4727,7 @@ hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, vfs_context_t context) printf("hfs_reclaim_journal_file: cat_idlookup returned %d\n", error); goto fail; } + oldBlock = jib_fork.cf_extents[0].startBlock; jib_fork.cf_size = hfsmp->blockSize; jib_fork.cf_extents[0].startBlock = newBlock; jib_fork.cf_extents[0].blockCount = 1; @@ -4635,6 +4755,10 @@ 
hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, vfs_context_t context) if (error) { printf("hfs_reclaim_journal_info_block: journal_flush returned %d\n", error); } + + if (!error && hfs_resize_debug) { + printf ("hfs_reclaim_journal_info_block: Successfully relocated journal info block from (%u,%u) to (%u,%u)\n", oldBlock, blockCount, newBlock, blockCount); + } return error; free_fail: @@ -4642,12 +4766,19 @@ hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, vfs_context_t context) fail: hfs_systemfile_unlock(hfsmp, lockflags); (void) hfs_end_transaction(hfsmp); + if (hfs_resize_debug) { + printf ("hfs_reclaim_journal_info_block: Error relocating journal info block (error=%d)\n", error); + } return error; } /* * Reclaim space at the end of a file system. + * + * Inputs - + * startblk - start block of the space being reclaimed + * reclaimblks - number of allocation blocks to reclaim */ static int hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimblks, vfs_context_t context) @@ -4663,45 +4794,53 @@ hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimbl int filecnt = 0; int maxfilecnt; u_int32_t block; - u_int32_t datablks; - u_int32_t rsrcblks; - u_int32_t blkstomove = 0; int lockflags; - int i; + int i, j; int error; int lastprogress = 0; - Boolean system_file_moved = false; + u_int32_t blks_moved = 0; + u_int32_t total_blks_moved = 0; + Boolean need_relocate; /* Relocate extents of the Allocation file if they're in the way. */ - error = hfs_reclaim_sys_file(hfsmp, hfsmp->hfs_allocation_vp, startblk, SFL_BITMAP, &system_file_moved, context); + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_allocation_vp, startblk, SFL_BITMAP, &blks_moved, context); if (error) { printf("hfs_reclaimspace: reclaim allocation file returned %d\n", error); return error; } + total_blks_moved += blks_moved; + /* Relocate extents of the Extents B-tree if they're in the way. 
*/ - error = hfs_reclaim_sys_file(hfsmp, hfsmp->hfs_extents_vp, startblk, SFL_EXTENTS, &system_file_moved, context); + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_extents_vp, startblk, SFL_EXTENTS, &blks_moved, context); if (error) { printf("hfs_reclaimspace: reclaim extents b-tree returned %d\n", error); return error; } + total_blks_moved += blks_moved; + /* Relocate extents of the Catalog B-tree if they're in the way. */ - error = hfs_reclaim_sys_file(hfsmp, hfsmp->hfs_catalog_vp, startblk, SFL_CATALOG, &system_file_moved, context); + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_catalog_vp, startblk, SFL_CATALOG, &blks_moved, context); if (error) { printf("hfs_reclaimspace: reclaim catalog b-tree returned %d\n", error); return error; } + total_blks_moved += blks_moved; + /* Relocate extents of the Attributes B-tree if they're in the way. */ - error = hfs_reclaim_sys_file(hfsmp, hfsmp->hfs_attribute_vp, startblk, SFL_ATTRIBUTE, &system_file_moved, context); + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_attribute_vp, startblk, SFL_ATTRIBUTE, &blks_moved, context); if (error) { printf("hfs_reclaimspace: reclaim attribute b-tree returned %d\n", error); return error; } + total_blks_moved += blks_moved; + /* Relocate extents of the Startup File if there is one and they're in the way. */ - error = hfs_reclaim_sys_file(hfsmp, hfsmp->hfs_startup_vp, startblk, SFL_STARTUP, &system_file_moved, context); + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_startup_vp, startblk, SFL_STARTUP, &blks_moved, context); if (error) { printf("hfs_reclaimspace: reclaim startup file returned %d\n", error); return error; } + total_blks_moved += blks_moved; /* * We need to make sure the alternate volume header gets flushed if we moved @@ -4709,12 +4848,13 @@ hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimbl * shrinking the size of the volume, or else the journal code will panic * with an invalid (too large) block number. 
* - * Note that system_file_moved will be set if ANY extent was moved, even + * Note that total_blks_moved will be set if ANY extent was moved, even * if it was just an overflow extent. In this case, the journal_flush isn't * strictly required, but shouldn't hurt. */ - if (system_file_moved) + if (total_blks_moved) { hfs_journal_flush(hfsmp); + } if (hfsmp->jnl_start + (hfsmp->jnl_size / hfsmp->blockSize) > startblk) { error = hfs_reclaim_journal_file(hfsmp, context); @@ -4745,6 +4885,7 @@ hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimbl } saved_next_allocation = hfsmp->nextAllocation; + /* Always try allocating new blocks after the metadata zone */ HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_start); fcb = VTOF(hfsmp->hfs_catalog_vp); @@ -4763,7 +4904,8 @@ hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimbl } /* * Iterate over all the catalog records looking for files - * that overlap into the space we're trying to free up. + * that overlap into the space we're trying to free up and + * the total number of blocks that will require relocation. */ for (filecnt = 0; filecnt < maxfilecnt; ) { error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); @@ -4776,58 +4918,64 @@ hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimbl if (filerec.recordType != kHFSPlusFileRecord) { continue; } - datablks = rsrcblks = 0; - /* - * Check if either fork overlaps target space. 
- */ + + need_relocate = false; + /* Check if data fork overlaps the target space */ for (i = 0; i < kHFSPlusExtentDensity; ++i) { - if (filerec.dataFork.extents[i].blockCount != 0) { - datablks += filerec.dataFork.extents[i].blockCount; - block = filerec.dataFork.extents[i].startBlock + - filerec.dataFork.extents[i].blockCount; - if (block >= startblk) { - if ((filerec.fileID == hfsmp->hfs_jnlfileid) || - (filerec.fileID == hfsmp->hfs_jnlinfoblkid)) { - printf("hfs_reclaimspace: cannot move active journal\n"); - error = EPERM; - goto end_iteration; - } - cnidbufp[filecnt++] = filerec.fileID; - blkstomove += filerec.dataFork.totalBlocks; - break; - } + if (filerec.dataFork.extents[i].blockCount == 0) { + break; } - if (filerec.resourceFork.extents[i].blockCount != 0) { - rsrcblks += filerec.resourceFork.extents[i].blockCount; - block = filerec.resourceFork.extents[i].startBlock + - filerec.resourceFork.extents[i].blockCount; - if (block >= startblk) { - cnidbufp[filecnt++] = filerec.fileID; - blkstomove += filerec.resourceFork.totalBlocks; - break; + block = filerec.dataFork.extents[i].startBlock + + filerec.dataFork.extents[i].blockCount; + if (block >= startblk) { + if ((filerec.fileID == hfsmp->hfs_jnlfileid) || + (filerec.fileID == hfsmp->hfs_jnlinfoblkid)) { + printf("hfs_reclaimspace: cannot move active journal\n"); + error = EPERM; + goto end_iteration; } + need_relocate = true; + goto save_fileid; } } - /* - * Check for any overflow extents that overlap. 
- */ - if (i == kHFSPlusExtentDensity) { - if (filerec.dataFork.totalBlocks > datablks) { - if (hfs_overlapped_overflow_extents(hfsmp, startblk, datablks, filerec.fileID, 0)) { - cnidbufp[filecnt++] = filerec.fileID; - blkstomove += filerec.dataFork.totalBlocks; - } - } else if (filerec.resourceFork.totalBlocks > rsrcblks) { - if (hfs_overlapped_overflow_extents(hfsmp, startblk, rsrcblks, filerec.fileID, 1)) { - cnidbufp[filecnt++] = filerec.fileID; - blkstomove += filerec.resourceFork.totalBlocks; - } + + /* Check if resource fork overlaps the target space */ + for (j = 0; j < kHFSPlusExtentDensity; ++j) { + if (filerec.resourceFork.extents[j].blockCount == 0) { + break; + } + block = filerec.resourceFork.extents[j].startBlock + + filerec.resourceFork.extents[j].blockCount; + if (block >= startblk) { + need_relocate = true; + goto save_fileid; + } + } + + /* Check if any forks' overflow extents overlap the target space */ + if ((i == kHFSPlusExtentDensity) || (j == kHFSPlusExtentDensity)) { + if (hfs_overlapped_overflow_extents(hfsmp, startblk, filerec.fileID)) { + need_relocate = true; + goto save_fileid; + } + } + +save_fileid: + if (need_relocate == true) { + cnidbufp[filecnt++] = filerec.fileID; + if (hfs_resize_debug) { + printf ("hfs_reclaimspace: Will relocate extents for fileID=%u\n", filerec.fileID); } } } end_iteration: - if (filecnt == 0 && !system_file_moved) { + /* If no regular file was found to be relocated and + * no system file was moved, we probably do not have + * enough space to relocate the system files, or + * something else went wrong. + */ + if ((filecnt == 0) && (total_blks_moved == 0)) { printf("hfs_reclaimspace: no files moved\n"); error = ENOSPC; } @@ -4836,66 +4984,52 @@ hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimbl if (error || filecnt == 0) goto out; - /* - * Double check space requirements to make sure - * there is enough space to relocate any files - * that reside in the reclaim area. 
- * - * Blocks To Move -------------- - * | | | - * V V V - * ------------------------------------------------------------------------ - * | | / /// // | - * | | / /// // | - * | | / /// // | - * ------------------------------------------------------------------------ - * - * <------------------- New Total Blocks ------------------><-- Reclaim --> - * - * <------------------------ Original Total Blocks -----------------------> - * - */ - if (blkstomove >= hfs_freeblks(hfsmp, 1)) { - printf("hfs_truncatefs: insufficient space (need %u blocks; have %u blocks)\n", blkstomove, hfs_freeblks(hfsmp, 1)); - error = ENOSPC; - goto out; - } hfsmp->hfs_resize_filesmoved = 0; hfsmp->hfs_resize_totalfiles = filecnt; /* Now move any files that are in the way. */ for (i = 0; i < filecnt; ++i) { - struct vnode * rvp; - struct cnode * cp; + struct vnode *rvp; + struct cnode *cp; + struct filefork *datafork; if (hfs_vget(hfsmp, cnidbufp[i], &vp, 0) != 0) continue; + + cp = VTOC(vp); + datafork = VTOF(vp); - /* Relocating directory hard links is not supported, so we - * punt (see radar 6217026). */ - cp = VTOC(vp); - if ((cp->c_flag & C_HARDLINK) && vnode_isdir(vp)) { - printf("hfs_reclaimspace: unable to relocate directory hard link %d\n", cp->c_cnid); - error = EINVAL; - goto out; - } - - /* Relocate any data fork blocks. */ - if (VTOF(vp) && VTOF(vp)->ff_blocks > 0) { - error = hfs_relocate(vp, hfsmp->hfs_metazone_end + 1, kauth_cred_get(), current_proc()); + /* Relocating directory hard links is not supported, so we punt (see radar 6217026). */ + if ((cp->c_flag & C_HARDLINK) && vnode_isdir(vp)) { + printf("hfs_reclaimspace: Unable to relocate directory hard link id=%d\n", cp->c_cnid); + error = EINVAL; + goto out; } - if (error) - break; - /* Relocate any resource fork blocks. */ - if ((cp->c_blocks - (VTOF(vp) ? VTOF((vp))->ff_blocks : 0)) > 0) { - error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE); - if (error) + /* Relocate any overlapping data fork blocks. 
*/ + if (datafork && datafork->ff_blocks > 0) { + error = hfs_reclaim_file(hfsmp, vp, startblk, 0, &blks_moved, context); + if (error) { + printf ("hfs_reclaimspace: Error reclaiming datafork blocks of fileid=%u (error=%d)\n", cnidbufp[i], error); break; - error = hfs_relocate(rvp, hfsmp->hfs_metazone_end + 1, kauth_cred_get(), current_proc()); + } + total_blks_moved += blks_moved; + } + + /* Relocate any overlapping resource fork blocks. */ + if ((cp->c_blocks - (datafork ? datafork->ff_blocks : 0)) > 0) { + error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, TRUE); + if (error) { + printf ("hfs_reclaimspace: Error looking up rvp for fileid=%u (error=%d)\n", cnidbufp[i], error); + break; + } + error = hfs_reclaim_file(hfsmp, rvp, startblk, 0, &blks_moved, context); VTOC(rvp)->c_flag |= C_NEED_RVNODE_PUT; - if (error) + if (error) { + printf ("hfs_reclaimspace: Error reclaiming rsrcfork blocks of fileid=%u (error=%d)\n", cnidbufp[i], error); break; + } + total_blks_moved += blks_moved; } hfs_unlock(cp); vnode_put(vp); @@ -4920,8 +5054,8 @@ hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimbl vp = NULL; } if (hfsmp->hfs_resize_filesmoved != 0) { - printf("hfs_reclaimspace: relocated %d files on \"%s\"\n", - (int)hfsmp->hfs_resize_filesmoved, hfsmp->vcbVN); + printf("hfs_reclaimspace: relocated %u blocks from %d files on \"%s\"\n", + total_blks_moved, (int)hfsmp->hfs_resize_filesmoved, hfsmp->vcbVN); } out: kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); @@ -4939,32 +5073,34 @@ hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimbl /* - * Check if there are any overflow extents that overlap. + * Check if there are any overflow data or resource fork extents that overlap + * into the disk space that is being reclaimed. 
+ * + * Output - + * 1 - One of the overflow extents need to be relocated + * 0 - No overflow extents need to be relocated, or there was an error */ static int -hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t catblks, u_int32_t fileID, int rsrcfork) +hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t fileID) { struct BTreeIterator * iterator = NULL; struct FSBufferDescriptor btdata; HFSPlusExtentRecord extrec; HFSPlusExtentKey *extkeyptr; FCB *fcb; - u_int32_t block; - u_int8_t forktype; int overlapped = 0; int i; int error; - forktype = rsrcfork ? 0xFF : 0; if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) { - return (0); + return 0; } bzero(iterator, sizeof(*iterator)); extkeyptr = (HFSPlusExtentKey *)&iterator->key; extkeyptr->keyLength = kHFSPlusExtentKeyMaximumLength; - extkeyptr->forkType = forktype; + extkeyptr->forkType = 0; extkeyptr->fileID = fileID; - extkeyptr->startBlock = catblks; + extkeyptr->startBlock = 0; btdata.bufferAddress = &extrec; btdata.itemSize = sizeof(extrec); @@ -4972,32 +5108,41 @@ hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_in fcb = VTOF(hfsmp->hfs_extents_vp); + /* This will position the iterator just before the first overflow + * extent record for given fileID. It will always return btNotFound, + * so we special case the error code. + */ error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator); + if (error && (error != btNotFound)) { + goto out; + } + + /* BTIterateRecord() might return error if the btree is empty, and + * therefore we return that the extent does not overflow to the caller + */ + error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); while (error == 0) { /* Stop when we encounter a different file. 
*/ - if ((extkeyptr->fileID != fileID) || - (extkeyptr->forkType != forktype)) { + if (extkeyptr->fileID != fileID) { break; } - /* - * Check if the file overlaps target space. - */ + /* Check if any of the forks exist in the target space. */ for (i = 0; i < kHFSPlusExtentDensity; ++i) { if (extrec[i].blockCount == 0) { break; } - block = extrec[i].startBlock + extrec[i].blockCount; - if (block >= startblk) { + if ((extrec[i].startBlock + extrec[i].blockCount) >= startblk) { overlapped = 1; - break; + goto out; } } /* Look for more records. */ error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); } +out: kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); - return (overlapped); + return overlapped; } diff --git a/bsd/hfs/hfs_vfsutils.c b/bsd/hfs/hfs_vfsutils.c index 2485c73f6..307e2db66 100644 --- a/bsd/hfs/hfs_vfsutils.c +++ b/bsd/hfs/hfs_vfsutils.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1493,7 +1493,7 @@ hfs_freeblks(struct hfsmount * hfsmp, int wantreserve) /* * We don't bother taking the mount lock * to look at these values since the values - * themselves are each updated automically + * themselves are each updated atomically * on aligned addresses. */ freeblks = hfsmp->freeBlocks; diff --git a/bsd/hfs/hfs_vnops.c b/bsd/hfs/hfs_vnops.c index 9114d0a99..eda49e242 100644 --- a/bsd/hfs/hfs_vnops.c +++ b/bsd/hfs/hfs_vnops.c @@ -812,8 +812,14 @@ hfs_vnop_getattr(struct vnop_getattr_args *ap) if (cp->c_blocks - VTOF(vp)->ff_blocks) { /* We deal with rsrc fork vnode iocount at the end of the function */ - error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE); + error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, TRUE); if (error) { + /* + * hfs_vgetrsrc may have returned a vnode in rvp even though + * we got an error, because we specified error_on_unlinked. 
+ * We need to drop the iocount after we release the cnode lock, so + * it will be taken care of at the end of the function if it's needed. + */ goto out; } @@ -2263,11 +2269,15 @@ hfs_vnop_remove(ap) if ((error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK))) { return (error); } - - error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE); + error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, TRUE); hfs_unlock(cp); if (error) { - return (error); + /* We may have gotten a rsrc vp out even though we got an error back. */ + if (rvp) { + vnode_put(rvp); + rvp = NULL; + } + return error; } drop_rsrc_vnode = 1; } @@ -2670,10 +2680,17 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, if (error && error != ENXIO && error != ENOENT && truncated) { if ((cp->c_datafork && cp->c_datafork->ff_size != 0) || (cp->c_rsrcfork && cp->c_rsrcfork->ff_size != 0)) { + off_t data_size = 0; + off_t rsrc_size = 0; + if (cp->c_datafork) { + data_size = cp->c_datafork->ff_size; + } + if (cp->c_rsrcfork) { + rsrc_size = cp->c_rsrcfork->ff_size; + } printf("hfs: remove: couldn't delete a truncated file (%s)" "(error %d, data sz %lld; rsrc sz %lld)", - cp->c_desc.cd_nameptr, error, cp->c_datafork->ff_size, - cp->c_rsrcfork->ff_size); + cp->c_desc.cd_nameptr, error, data_size, rsrc_size); hfs_mark_volume_inconsistent(hfsmp); } else { printf("hfs: remove: strangely enough, deleting truncated file %s (%d) got err %d\n", @@ -2850,10 +2867,17 @@ hfs_vnop_rename(ap) if ((error = hfs_lock (VTOC(fvp), HFS_EXCLUSIVE_LOCK))) { return (error); } - - error = hfs_vgetrsrc(VTOHFS(fvp), fvp, &fvp_rsrc, TRUE); + + /* + * We care if we race against rename/delete with this cnode, so we'll + * error out if this file becomes open-unlinked during this call. 
+ */ + error = hfs_vgetrsrc(VTOHFS(fvp), fvp, &fvp_rsrc, TRUE, TRUE); hfs_unlock (VTOC(fvp)); if (error) { + if (fvp_rsrc) { + vnode_put (fvp_rsrc); + } return error; } } @@ -2865,13 +2889,30 @@ hfs_vnop_rename(ap) * grab the resource fork if the lock succeeded. */ if (hfs_lock (VTOC(tvp), HFS_EXCLUSIVE_LOCK) == 0) { - error = hfs_vgetrsrc(VTOHFS(tvp), tvp, &tvp_rsrc, TRUE); - hfs_unlock (VTOC(tvp)); + tcp = VTOC(tvp); + + /* + * We only care if we get an open-unlinked file on the dst so we + * know to null out tvp/tcp to make the rename operation act + * as if they never existed. Because they're effectively out of the + * namespace already it's fine to do this. If this is true, then + * make sure to unlock the cnode and drop the iocount only after the unlock. + */ + error = hfs_vgetrsrc(VTOHFS(tvp), tvp, &tvp_rsrc, TRUE, TRUE); + hfs_unlock (tcp); if (error) { - if (fvp_rsrc) { - vnode_put (fvp_rsrc); + /* + * Since we specify TRUE for error-on-unlinked in hfs_vgetrsrc, + * we can get a rsrc fork vp even if it returns an error. + */ + tcp = NULL; + tvp = NULL; + if (tvp_rsrc) { + vnode_put (tvp_rsrc); + tvp_rsrc = NULLVP; } - return error; + /* just bypass truncate lock and act as if we never got tcp/tvp */ + goto retry; } } } @@ -4282,22 +4323,48 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, } -/* - * Return a referenced vnode for the resource fork - * - * cnode for vnode vp must already be locked. - * - * can_drop_lock is true if its safe to temporarily drop/re-acquire the cnode lock + +/* hfs_vgetrsrc acquires a resource fork vnode corresponding to the cnode that is + * found in 'vp'. The rsrc fork vnode is returned with the cnode locked and iocount + * on the rsrc vnode. + * + * *rvpp is an output argument for returning the pointer to the resource fork vnode. + * In most cases, the resource fork vnode will not be set if we return an error. 
+ * However, if error_on_unlinked is set, we may have already acquired the resource fork vnode + * before we discover the error (the file has gone open-unlinked). In this case only, + * we may return a vnode in the output argument despite an error. + * + * If can_drop_lock is set, then it is safe for this function to temporarily drop + * and then re-acquire the cnode lock. We may need to do this, for example, in order to + * acquire an iocount or promote our lock. + * + * error_on_unlinked is an argument which indicates that we are to return an error if we + * discover that the cnode has gone into an open-unlinked state ( C_DELETED or C_NOEXISTS) + * is set in the cnode flags. This is only necessary if can_drop_lock is true, otherwise + * there's really no reason to double-check for errors on the cnode. */ + __private_extern__ int -hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, struct vnode **rvpp, int can_drop_lock) +hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, + struct vnode **rvpp, int can_drop_lock, int error_on_unlinked) { struct vnode *rvp; struct vnode *dvp = NULLVP; struct cnode *cp = VTOC(vp); int error; int vid; + int delete_status = 0; + + + /* + * Need to check the status of the cnode to validate it hasn't + * gone open-unlinked on us before we can actually do work with it. + */ + delete_status = hfs_checkdeleted (cp); + if ((delete_status) && (error_on_unlinked)) { + return delete_status; + } restart: /* Attempt to use exising vnode */ @@ -4324,6 +4391,32 @@ hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, struct vnode **rvpp, int if (can_drop_lock) { (void) hfs_lock(cp, HFS_FORCE_LOCK); + + /* + * When we relinquished our cnode lock, the cnode could have raced + * with a delete and gotten deleted. If the caller did not want + * us to ignore open-unlinked files, then re-check the C_DELETED + * state and see if we need to return an ENOENT here because the item + * got deleted in the intervening time. 
+ */ + if (error_on_unlinked) { + if ((delete_status = hfs_checkdeleted(cp))) { + /* + * If error == 0, this means that we succeeded in acquiring an iocount on the + * rsrc fork vnode. However, if we're in this block of code, that + * means that we noticed that the cnode has gone open-unlinked. In + * this case, the caller requested that we not do any other work and + * return an errno. The caller will be responsible for dropping the + * iocount we just acquired because we can't do it until we've released + * the cnode lock. + */ + if (error == 0) { + *rvpp = rvp; + } + return delete_status; + } + } + /* * When our lock was relinquished, the resource fork * could have been recycled. Check for this and try @@ -4359,7 +4452,7 @@ hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, struct vnode **rvpp, int return (EINVAL); } /* - * If the upgrade fails we loose the lock and + * If the upgrade fails we lose the lock and * have to take the exclusive lock on our own. */ if (lck_rw_lock_shared_to_exclusive(&cp->c_rwlock) == FALSE) @@ -4372,9 +4465,17 @@ hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, struct vnode **rvpp, int * C_DELETED. This is because we need to continue to provide rsrc * fork access to open-unlinked files. In this case, build a fake descriptor * like in hfs_removefile. If we don't do this, buildkey will fail in - * cat_lookup because this cnode has no name in its descriptor. + * cat_lookup because this cnode has no name in its descriptor. However, + * only do this if the caller did not specify that they wanted us to + * error out upon encountering open-unlinked files. 
*/ + if ((error_on_unlinked) && (can_drop_lock)) { + if ((error = hfs_checkdeleted (cp))) { + return error; + } + } + if ((cp->c_flag & C_DELETED ) && (cp->c_desc.cd_namelen == 0)) { bzero (&to_desc, sizeof(to_desc)); bzero (delname, 32); diff --git a/bsd/hfs/hfs_xattr.c b/bsd/hfs/hfs_xattr.c index 915fbe874..f552b9c75 100644 --- a/bsd/hfs/hfs_xattr.c +++ b/bsd/hfs/hfs_xattr.c @@ -141,7 +141,7 @@ hfs_vnop_getnamedstream(struct vnop_getnamedstream_args* ap) hfs_unlock(cp); return (ENOATTR); } - error = hfs_vgetrsrc(VTOHFS(vp), vp, svpp, TRUE); + error = hfs_vgetrsrc(VTOHFS(vp), vp, svpp, TRUE, FALSE); hfs_unlock(cp); return (error); @@ -184,7 +184,7 @@ hfs_vnop_makenamedstream(struct vnop_makenamedstream_args* ap) if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) { return (error); } - error = hfs_vgetrsrc(VTOHFS(vp), vp, svpp, TRUE); + error = hfs_vgetrsrc(VTOHFS(vp), vp, svpp, TRUE, FALSE); hfs_unlock(cp); return (error); @@ -328,7 +328,7 @@ hfs_vnop_getxattr(struct vnop_getxattr_args *ap) openunlinked = 1; } - result = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE); + result = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, FALSE); hfs_unlock(cp); if (result) { return (result); @@ -719,7 +719,7 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) openunlinked = 1; } - result = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE); + result = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, FALSE); hfs_unlock(cp); if (result) { return (result); @@ -1096,7 +1096,7 @@ hfs_vnop_removexattr(struct vnop_removexattr_args *ap) hfs_unlock(cp); return (ENOATTR); } - result = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE); + result = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, FALSE); hfs_unlock(cp); if (result) { return (result); @@ -2302,9 +2302,9 @@ free_attr_blks(struct hfsmount *hfsmp, int blkcnt, HFSPlusExtentDescriptor *exte break; } (void)BlockDeallocate(hfsmp, extents[i].startBlock, extents[i].blockCount); + remblks -= extents[i].blockCount; extents[i].startBlock = 0; extents[i].blockCount = 0; - remblks -= extents[i].blockCount; #if 
HFS_XATTR_VERBOSE printf("hfs: free_attr_blks: BlockDeallocate [%d, %d]\n", diff --git a/bsd/hfs/hfscommon/BTree/BTreeAllocate.c b/bsd/hfs/hfscommon/BTree/BTreeAllocate.c index 64c7b86f0..99d586408 100644 --- a/bsd/hfs/hfscommon/BTree/BTreeAllocate.c +++ b/bsd/hfs/hfscommon/BTree/BTreeAllocate.c @@ -696,6 +696,18 @@ BTZeroUnusedNodes(FCB *filePtr) goto ErrorExit; } + if (buf_flags(bp) & B_LOCKED) { + /* + * This node is already part of a transaction and will be + * written when the transaction is committed so don't write it here. + * If we did, then we'd hit a panic in hfs_vnop_bwrite since + * B_LOCKED is still set + */ + buf_brelse(bp); + continue; + } + + buf_clear(bp); buf_markaged(bp); diff --git a/bsd/kern/imageboot.c b/bsd/kern/imageboot.c index 6325962b2..0ed79dc69 100644 --- a/bsd/kern/imageboot.c +++ b/bsd/kern/imageboot.c @@ -123,8 +123,8 @@ imageboot_setup() error = vfs_mountroot(); if (error == 0 && rootvnode != NULL) { - struct vnode *tvp; - struct vnode *newdp; + vnode_t newdp, old_rootvnode; + mount_t new_rootfs, old_rootfs; /* * Get the vnode for '/'. 
@@ -133,17 +133,45 @@ imageboot_setup() if (VFS_ROOT(TAILQ_LAST(&mountlist,mntlist), &newdp, vfs_context_kernel())) panic("%s: cannot find root vnode", __FUNCTION__); + old_rootvnode = rootvnode; + old_rootfs = rootvnode->v_mount; + + mount_list_remove(old_rootfs); + + mount_lock(old_rootfs); +#ifdef CONFIG_IMGSRC_ACCESS + old_rootfs->mnt_kern_flag |= MNTK_BACKS_ROOT; +#endif /* CONFIG_IMGSRC_ACCESS */ + old_rootfs->mnt_flag &= ~MNT_ROOTFS; + mount_unlock(old_rootfs); + + rootvnode = newdp; + + new_rootfs = rootvnode->v_mount; + mount_lock(new_rootfs); + new_rootfs->mnt_flag |= MNT_ROOTFS; + mount_unlock(new_rootfs); + vnode_ref(newdp); vnode_put(newdp); - tvp = rootvnode; - vnode_rele(tvp); filedesc0.fd_cdir = newdp; - rootvnode = newdp; - mount_list_lock(); - TAILQ_REMOVE(&mountlist, TAILQ_FIRST(&mountlist), mnt_list); - mount_list_unlock(); - mountlist.tqh_first->mnt_flag |= MNT_ROOTFS; DBG_TRACE("%s: root switched\n", __FUNCTION__); + +#ifdef CONFIG_IMGSRC_ACCESS + if (PE_imgsrc_mount_supported()) { + imgsrc_rootvnode = old_rootvnode; + } else { + vnode_getalways(old_rootvnode); + vnode_rele(old_rootvnode); + vnode_put(old_rootvnode); + } +#else + vnode_getalways(old_rootvnode); + vnode_rele(old_rootvnode); + vnode_put(old_rootvnode); +#endif /* CONFIG_IMGSRC_ACCESS */ + + } done: FREE_ZONE(root_path, MAXPATHLEN, M_NAMEI); diff --git a/bsd/kern/kdebug.c b/bsd/kern/kdebug.c index ee97c249c..bc3089a8f 100644 --- a/bsd/kern/kdebug.c +++ b/bsd/kern/kdebug.c @@ -168,10 +168,10 @@ static lck_mtx_t stackshot_subsys_mutex; void *stackshot_snapbuf = NULL; int -stack_snapshot2(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t options, int32_t *retval); +stack_snapshot2(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, uint32_t dispatch_offset, int32_t *retval); extern void -kdp_snapshot_preflight(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t options); +kdp_snapshot_preflight(int pid, void *tracebuf, uint32_t 
tracebuf_size, uint32_t flags, uint32_t dispatch_offset); extern int kdp_stack_snapshot_geterror(void); @@ -1705,11 +1705,11 @@ stack_snapshot(struct proc *p, register struct stack_snapshot_args *uap, int32_t return(error); return stack_snapshot2(uap->pid, uap->tracebuf, uap->tracebuf_size, - uap->options, retval); + uap->flags, uap->dispatch_offset, retval); } int -stack_snapshot2(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t options, int32_t *retval) +stack_snapshot2(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, uint32_t dispatch_offset, int32_t *retval) { int error = 0; unsigned bytesTraced = 0; @@ -1730,7 +1730,7 @@ stack_snapshot2(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_ goto error_exit; } /* Preload trace parameters*/ - kdp_snapshot_preflight(pid, stackshot_snapbuf, tracebuf_size, options); + kdp_snapshot_preflight(pid, stackshot_snapbuf, tracebuf_size, flags, dispatch_offset); /* Trap to the debugger to obtain a coherent stack snapshot; this populates * the trace buffer diff --git a/bsd/kern/kern_event.c b/bsd/kern/kern_event.c index 1a0f609ca..5d195dcf0 100644 --- a/bsd/kern/kern_event.c +++ b/bsd/kern/kern_event.c @@ -1564,6 +1564,17 @@ kevent_register(struct kqueue *kq, struct kevent64_s *kev, __unused struct proc knote_enqueue(kn); } + /* + * The user may change some filter values after the + * initial EV_ADD, but doing so will not reset any + * filter which have already been triggered. + */ + kn->kn_kevent.udata = kev->udata; + if (fops->f_isfd || fops->f_touch == NULL) { + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + } + /* * If somebody is in the middle of dropping this * knote - go find/insert a new one. 
But we have @@ -1578,17 +1589,11 @@ kevent_register(struct kqueue *kq, struct kevent64_s *kev, __unused struct proc } /* - * The user may change some filter values after the - * initial EV_ADD, but doing so will not reset any - * filter which have already been triggered. + * Call touch routine to notify filter of changes + * in filter values. */ - kn->kn_kevent.udata = kev->udata; if (!fops->f_isfd && fops->f_touch != NULL) fops->f_touch(kn, kev, EVENT_REGISTER); - else { - kn->kn_sfflags = kev->fflags; - kn->kn_sdata = kev->data; - } /* We may need to push some info down to a networked filesystem */ if (kn->kn_filter == EVFILT_VNODE) { @@ -1680,13 +1685,10 @@ knote_process(struct knote *kn, } /* capture the kevent data - using touch if specified */ - if (result) { - if (touch) { - kn->kn_fop->f_touch(kn, &kev, EVENT_PROCESS); - } else { - kev = kn->kn_kevent; - } + if (result && touch) { + kn->kn_fop->f_touch(kn, &kev, EVENT_PROCESS); } + /* convert back to a kqlock - bail if the knote went away */ if (!knoteuse2kqlock(kq, kn)) { return EJUSTRETURN; @@ -1695,6 +1697,12 @@ knote_process(struct knote *kn, if (!(kn->kn_status & KN_ACTIVE)) { knote_activate(kn, 0); } + + /* capture all events that occurred during filter */ + if (!touch) { + kev = kn->kn_kevent; + } + } else if ((kn->kn_status & KN_STAYQUEUED) == 0) { /* was already dequeued, so just bail on this one */ return EJUSTRETURN; @@ -1724,21 +1732,26 @@ knote_process(struct knote *kn, if (result == 0) { return EJUSTRETURN; - } else if (kn->kn_flags & EV_ONESHOT) { + } else if ((kn->kn_flags & EV_ONESHOT) != 0) { knote_deactivate(kn); if (kqlock2knotedrop(kq, kn)) { kn->kn_fop->f_detach(kn); knote_drop(kn, p); } - } else if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { - knote_deactivate(kn); - /* manually clear knotes who weren't 'touch'ed */ - if ((touch == 0) && (kn->kn_flags & EV_CLEAR)) { + } else if ((kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) != 0) { + if ((kn->kn_flags & EV_DISPATCH) != 0) { + /* 
deactivate and disable all dispatch knotes */ + knote_deactivate(kn); + kn->kn_status |= KN_DISABLED; + } else if (!touch || kn->kn_fflags == 0) { + /* only deactivate if nothing since the touch */ + knote_deactivate(kn); + } + if (!touch && (kn->kn_flags & EV_CLEAR) != 0) { + /* manually clear non-touch knotes */ kn->kn_data = 0; kn->kn_fflags = 0; } - if (kn->kn_flags & EV_DISPATCH) - kn->kn_status |= KN_DISABLED; kqunlock(kq); } else { /* diff --git a/bsd/kern/kern_memorystatus.c b/bsd/kern/kern_memorystatus.c index f5e141455..35e9a43a6 100644 --- a/bsd/kern/kern_memorystatus.c +++ b/bsd/kern/kern_memorystatus.c @@ -315,7 +315,7 @@ sysctl_handle_kern_memorystatus_priority_list(__unused struct sysctl_oid *oid, _ #if DEBUG printf("set jetsam priority pids = { "); for (i = 0; i < jetsam_priority_list_count; i++) { - printf("%d ", temp_list[i].pid); + printf("(%d, 0x%08x, %d) ", temp_list[i].pid, temp_list[i].flags, temp_list[i].hiwat_pages); } printf("}\n"); #endif /* DEBUG */ @@ -326,6 +326,10 @@ sysctl_handle_kern_memorystatus_priority_list(__unused struct sysctl_oid *oid, _ for (i = jetsam_priority_list_count; i < kMaxPriorityEntries; i++) { jetsam_priority_list[i].pid = 0; jetsam_priority_list[i].flags = 0; + jetsam_priority_list[i].hiwat_pages = -1; + jetsam_priority_list[i].hiwat_reserved1 = -1; + jetsam_priority_list[i].hiwat_reserved2 = -1; + jetsam_priority_list[i].hiwat_reserved3 = -1; } jetsam_priority_list_index = 0; lck_mtx_unlock(jetsam_list_mlock); diff --git a/bsd/kern/kern_mman.c b/bsd/kern/kern_mman.c index 02166d578..6da43d2fd 100644 --- a/bsd/kern/kern_mman.c +++ b/bsd/kern/kern_mman.c @@ -690,7 +690,7 @@ mprotect(__unused proc_t p, struct mprotect_args *uap, __unused int32_t *retval) user_addr = (mach_vm_offset_t) uap->addr; user_size = (mach_vm_size_t) uap->len; - prot = (vm_prot_t)(uap->prot & VM_PROT_ALL); + prot = (vm_prot_t)(uap->prot & (VM_PROT_ALL | VM_PROT_TRUSTED)); if (user_addr & PAGE_MASK_64) { /* UNIX SPEC: user address is not 
page-aligned, return EINVAL */ @@ -728,6 +728,34 @@ mprotect(__unused proc_t p, struct mprotect_args *uap, __unused int32_t *retval) if (error) return (error); #endif + + if(prot & VM_PROT_TRUSTED) { +#if CONFIG_DYNAMIC_CODE_SIGNING + /* CODE SIGNING ENFORCEMENT - JIT support */ + /* The special protection value VM_PROT_TRUSTED requests that we treat + * this page as if it had a valid code signature. + * If this is enabled, there MUST be a MAC policy implementing the + * mac_proc_check_mprotect() hook above. Otherwise, Codesigning will be + * compromised because the check would always succeed and thusly any + * process could sign dynamically. */ + result = vm_map_sign(user_map, + vm_map_trunc_page(user_addr), + vm_map_round_page(user_addr+user_size)); + switch (result) { + case KERN_SUCCESS: + break; + case KERN_INVALID_ADDRESS: + /* UNIX SPEC: for an invalid address range, return ENOMEM */ + return ENOMEM; + default: + return EINVAL; + } +#else + return ENOTSUP; +#endif + } + prot &= ~VM_PROT_TRUSTED; + result = mach_vm_protect(user_map, user_addr, user_size, FALSE, prot); switch (result) { diff --git a/bsd/kern/kern_resource.c b/bsd/kern/kern_resource.c index b51c4ecbe..02b61872a 100644 --- a/bsd/kern/kern_resource.c +++ b/bsd/kern/kern_resource.c @@ -109,7 +109,9 @@ int donice(struct proc *curp, struct proc *chgp, int n); int dosetrlimit(struct proc *p, u_int which, struct rlimit *limp); +static void do_background_socket(struct proc *curp, thread_t thread, int priority); static int do_background_thread(struct proc *curp, int priority); +static int do_background_task(struct proc *curp, int priority); rlim_t maxdmap = MAXDSIZ; /* XXX */ rlim_t maxsmap = MAXSSIZ - PAGE_SIZE; /* XXX */ @@ -369,10 +371,35 @@ setpriority(struct proc *curp, struct setpriority_args *uap, __unused int32_t *r return (EINVAL); } error = do_background_thread(curp, uap->prio); + (void) do_background_socket(curp, current_thread(), uap->prio); found++; break; } + case PRIO_DARWIN_PROCESS: { + 
if (uap->who == 0) + p = curp; + else { + p = proc_find(uap->who); + if (p == 0) + break; + refheld = 1; + } + + error = do_background_task(p, uap->prio); + (void) do_background_socket(p, NULL, uap->prio); + + proc_lock(p); + p->p_iopol_disk = (uap->prio == PRIO_DARWIN_BG ? + IOPOL_THROTTLE : IOPOL_DEFAULT); + proc_unlock(p); + + found++; + if (refheld != 0) + proc_rele(p); + break; + } + default: return (EINVAL); } @@ -427,20 +454,93 @@ donice(struct proc *curp, struct proc *chgp, int n) return (error); } +static int +do_background_task(struct proc *p, int priority) +{ + int error = 0; + task_category_policy_data_t info; + + if (priority & PRIO_DARWIN_BG) { + info.role = TASK_THROTTLE_APPLICATION; + } else { + info.role = TASK_DEFAULT_APPLICATION; + } + + error = task_policy_set(p->task, + TASK_CATEGORY_POLICY, + (task_policy_t) &info, + TASK_CATEGORY_POLICY_COUNT); + return (error); +} + +static void +do_background_socket(struct proc *curp, thread_t thread, int priority) +{ + struct filedesc *fdp; + struct fileproc *fp; + int i; + + if (priority & PRIO_DARWIN_BG) { + /* enable network throttle process-wide (if no thread is specified) */ + if (thread == NULL) { + proc_fdlock(curp); + fdp = curp->p_fd; + + for (i = 0; i < fdp->fd_nfiles; i++) { + struct socket *sockp; + + fp = fdp->fd_ofiles[i]; + if (fp == NULL || (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 || + fp->f_fglob->fg_type != DTYPE_SOCKET) { + continue; + } + sockp = (struct socket *)fp->f_fglob->fg_data; + sockp->so_traffic_mgt_flags |= TRAFFIC_MGT_SO_BACKGROUND; + sockp->so_background_thread = NULL; + } + proc_fdunlock(curp); + } + + } else { + /* disable networking IO throttle. + * NOTE - It is a known limitation of the current design that we + * could potentially clear TRAFFIC_MGT_SO_BACKGROUND bit for + * sockets created by other threads within this process. 
+ */ + proc_fdlock(curp); + fdp = curp->p_fd; + for ( i = 0; i < fdp->fd_nfiles; i++ ) { + struct socket *sockp; + + fp = fdp->fd_ofiles[ i ]; + if ( fp == NULL || (fdp->fd_ofileflags[ i ] & UF_RESERVED) != 0 || + fp->f_fglob->fg_type != DTYPE_SOCKET ) { + continue; + } + sockp = (struct socket *)fp->f_fglob->fg_data; + /* skip if only clearing this thread's sockets */ + if ((thread) && (sockp->so_background_thread != thread)) { + continue; + } + sockp->so_traffic_mgt_flags &= ~TRAFFIC_MGT_SO_BACKGROUND; + sockp->so_background_thread = NULL; + } + proc_fdunlock(curp); + } +} + + /* * do_background_thread * Returns: 0 Success * XXX - todo - does this need a MACF hook? */ static int -do_background_thread(struct proc *curp, int priority) +do_background_thread(struct proc *curp __unused, int priority) { - int i; thread_t thread; struct uthread *ut; thread_precedence_policy_data_t policy; - struct filedesc *fdp; - struct fileproc *fp; thread = current_thread(); ut = get_bsdthread_info(thread); @@ -461,31 +561,6 @@ do_background_thread(struct proc *curp, int priority) thread_policy_set( thread, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&policy, THREAD_PRECEDENCE_POLICY_COUNT ); - - /* disable networking IO throttle. - * NOTE - It is a known limitation of the current design that we - * could potentially clear TRAFFIC_MGT_SO_BACKGROUND bit for - * sockets created by other threads within this process. 
- */ - proc_fdlock(curp); - fdp = curp->p_fd; - for ( i = 0; i < fdp->fd_nfiles; i++ ) { - struct socket *sockp; - - fp = fdp->fd_ofiles[ i ]; - if ( fp == NULL || (fdp->fd_ofileflags[ i ] & UF_RESERVED) != 0 || - fp->f_fglob->fg_type != DTYPE_SOCKET ) { - continue; - } - sockp = (struct socket *)fp->f_fglob->fg_data; - if ( sockp->so_background_thread != thread ) { - continue; - } - sockp->so_traffic_mgt_flags &= ~TRAFFIC_MGT_SO_BACKGROUND; - sockp->so_background_thread = NULL; - } - proc_fdunlock(curp); - return(0); } diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index 7303287c1..842a3e572 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -2365,6 +2365,47 @@ SYSCTL_PROC(_kern, KERN_NETBOOT, netboot, 0, 0, sysctl_netboot, "I", ""); #endif +#ifdef CONFIG_IMGSRC_ACCESS +static int +sysctl_imgsrcdev +(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + vfs_context_t ctx = vfs_context_current(); + vnode_t devvp; + int result; + + if (!vfs_context_issuser(ctx)) { + return EPERM; + } + + if (imgsrc_rootvnode == NULL) { + return ENOENT; + } + + result = vnode_getwithref(imgsrc_rootvnode); + if (result != 0) { + return result; + } + + devvp = vnode_mount(imgsrc_rootvnode)->mnt_devvp; + result = vnode_getwithref(devvp); + if (result != 0) { + goto out; + } + + result = sysctl_io_number(req, vnode_specrdev(devvp), sizeof(dev_t), NULL, NULL); + + vnode_put(devvp); +out: + vnode_put(imgsrc_rootvnode); + return result; +} + +SYSCTL_PROC(_kern, OID_AUTO, imgsrcdev, + CTLTYPE_INT | CTLFLAG_RD, + 0, 0, sysctl_imgsrcdev, "I", ""); +#endif /* CONFIG_IMGSRC_ACCESS */ + static int sysctl_usrstack (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) @@ -2815,3 +2856,12 @@ SYSCTL_INT (_kern, OID_AUTO, stack_size, SYSCTL_INT (_kern, OID_AUTO, stack_depth_max, CTLFLAG_RD, (int *) &kernel_stack_depth_max, 0, "Max kernel stack depth at interrupt or context 
switch"); +/* + * enable back trace for port allocations + */ +extern int ipc_portbt; + +SYSCTL_INT(_kern, OID_AUTO, ipc_portbt, + CTLFLAG_RW | CTLFLAG_KERN, + &ipc_portbt, 0, ""); + diff --git a/bsd/kern/pthread_synch.c b/bsd/kern/pthread_synch.c index 8c58b3ece..df178d791 100644 --- a/bsd/kern/pthread_synch.c +++ b/bsd/kern/pthread_synch.c @@ -136,6 +136,7 @@ static boolean_t workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t static void wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl, int reuse_thread, int wake_thread, int return_directly); static void wq_unpark_continue(void); +static void wq_unsuspend_continue(void); static int setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl); static boolean_t workqueue_addnewthread(struct workqueue *wq); static void workqueue_removethread(struct threadlist *tl); @@ -446,7 +447,6 @@ bsdthread_register(struct proc *p, struct bsdthread_register_args *uap, __unuse return(0); } - uint32_t wq_yielded_threshold = WQ_YIELDED_THRESHOLD; uint32_t wq_yielded_window_usecs = WQ_YIELDED_WINDOW_USECS; uint32_t wq_stalled_window_usecs = WQ_STALLED_WINDOW_USECS; @@ -903,15 +903,11 @@ workqueue_callback(int type, thread_t thread) * the thread lock for the thread being UNBLOCKED * is also held */ - if (tl->th_suspended) { - OSAddAtomic(-1, &tl->th_suspended); - KERNEL_DEBUG1(0xefffd024, wq, wq->wq_threads_scheduled, tl->th_priority, tl->th_affinity_tag, thread_tid(thread)); - } else { - OSAddAtomic(1, &wq->wq_thactive_count[tl->th_priority][tl->th_affinity_tag]); + OSAddAtomic(1, &wq->wq_thactive_count[tl->th_priority][tl->th_affinity_tag]); - KERNEL_DEBUG1(0xefffd020 | DBG_FUNC_END, wq, wq->wq_threads_scheduled, tl->th_priority, tl->th_affinity_tag, thread_tid(thread)); - } - break; + KERNEL_DEBUG1(0xefffd020 | DBG_FUNC_END, wq, wq->wq_threads_scheduled, tl->th_priority, tl->th_affinity_tag, thread_tid(thread)); + + break; } } @@ -986,7 +982,7 @@ 
workqueue_addnewthread(struct workqueue *wq) p = wq->wq_proc; workqueue_unlock(p); - kret = thread_create_workq(wq->wq_task, &th); + kret = thread_create_workq(wq->wq_task, (thread_continue_t)wq_unsuspend_continue, &th); if (kret != KERN_SUCCESS) goto failed; @@ -1046,7 +1042,6 @@ workqueue_addnewthread(struct workqueue *wq) tl->th_affinity_tag = -1; tl->th_priority = WORKQUEUE_NUMPRIOS; tl->th_policy = -1; - tl->th_suspended = 1; #if defined(__ppc__) //ml_fp_setvalid(FALSE); @@ -1057,7 +1052,7 @@ workqueue_addnewthread(struct workqueue *wq) uth->uu_threadlist = (void *)tl; workqueue_lock_spin(p); - + TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry); wq->wq_thidlecount++; @@ -1306,7 +1301,6 @@ workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, __unused in } - void workqueue_exit(struct proc *p) { @@ -1457,9 +1451,6 @@ workqueue_removeitem(struct workqueue *wq, int prio, user_addr_t item) return (error); } - - - static int workqueue_importance[WORKQUEUE_NUMPRIOS] = { 2, 0, -2, @@ -1710,14 +1701,11 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add tl->th_flags &= ~TH_LIST_SUSPENDED; reuse_thread = 0; - thread_sched_call(tl->th_thread, workqueue_callback); - } else if ((tl->th_flags & TH_LIST_BLOCKED) == TH_LIST_BLOCKED) { tl->th_flags &= ~TH_LIST_BLOCKED; - tl->th_flags |= TH_LIST_BUSY; wake_thread = 1; } - tl->th_flags |= TH_LIST_RUNNING; + tl->th_flags |= TH_LIST_RUNNING | TH_LIST_BUSY; wq->wq_threads_scheduled++; wq->wq_thscheduled_count[priority][affinity_tag]++; @@ -1894,6 +1882,80 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add } +static void +wq_unsuspend_continue(void) +{ + struct uthread *uth = NULL; + thread_t th_to_unsuspend; + struct threadlist *tl; + proc_t p; + + th_to_unsuspend = current_thread(); + uth = get_bsdthread_info(th_to_unsuspend); + + if (uth != NULL && (tl = uth->uu_threadlist) != NULL) { + + if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == 
TH_LIST_RUNNING) { + /* + * most likely a normal resume of this thread occurred... + * it's also possible that the thread was aborted after we + * finished setting it up so that it could be dispatched... if + * so, thread_bootstrap_return will notice the abort and put + * the thread on the path to self-destruction + */ +normal_resume_to_user: + thread_sched_call(th_to_unsuspend, workqueue_callback); + + thread_bootstrap_return(); + } + /* + * if we get here, it's because we've been resumed due to + * an abort of this thread (process is crashing) + */ + p = current_proc(); + + workqueue_lock_spin(p); + + if (tl->th_flags & TH_LIST_SUSPENDED) { + /* + * thread has been aborted while still on our idle + * queue... remove it from our domain... + * workqueue_removethread consumes the lock + */ + workqueue_removethread(tl); + + thread_bootstrap_return(); + } + while ((tl->th_flags & TH_LIST_BUSY)) { + /* + * this thread was aborted after we started making + * it runnable, but before we finished dispatching it... + * we need to wait for that process to finish, + * and we need to ask for a wakeup instead of a + * thread_resume since the abort has already resumed us + */ + tl->th_flags |= TH_LIST_NEED_WAKEUP; + + assert_wait((caddr_t)tl, (THREAD_UNINT)); + + workqueue_unlock(p); + + thread_block(THREAD_CONTINUE_NULL); + + workqueue_lock_spin(p); + } + workqueue_unlock(p); + /* + * we have finished setting up the thread's context... 
+ * thread_bootstrap_return will take us through the abort path + * where the thread will self destruct + */ + goto normal_resume_to_user; + } + thread_bootstrap_return(); +} + + static void wq_unpark_continue(void) { @@ -1996,11 +2058,19 @@ wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl, } else { KERNEL_DEBUG1(0xefffd014 | DBG_FUNC_END, tl->th_workq, 0, 0, thread_tid(current_thread()), thread_tid(th)); - thread_resume(th); + workqueue_lock_spin(p); + + if (tl->th_flags & TH_LIST_NEED_WAKEUP) + wakeup(tl); + else + thread_resume(th); + + tl->th_flags &= ~(TH_LIST_BUSY | TH_LIST_NEED_WAKEUP); + + workqueue_unlock(p); } } - int setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl) { diff --git a/bsd/kern/sys_generic.c b/bsd/kern/sys_generic.c index 3da6c6d55..11a276bbd 100644 --- a/bsd/kern/sys_generic.c +++ b/bsd/kern/sys_generic.c @@ -239,8 +239,7 @@ pread_nocancel(struct proc *p, struct pread_nocancel_args *uap, user_ssize_t *re donefileread(p, fp, fd); - if (!error) - KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE), + KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE), uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0); out: @@ -531,8 +530,7 @@ pwrite_nocancel(struct proc *p, struct pwrite_nocancel_args *uap, user_ssize_t * else fp_drop(p, fd, fp, 0); - if (!error) - KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE), + KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE), uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0); return(error); diff --git a/bsd/kern/syscalls.master b/bsd/kern/syscalls.master index d82fc7f83..5ab2dd50b 100644 --- a/bsd/kern/syscalls.master +++ b/bsd/kern/syscalls.master @@ -545,7 +545,7 @@ 362 AUE_KQUEUE ALL { int kqueue(void); } 363 
AUE_NULL ALL { int kevent(int fd, const struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); } 364 AUE_LCHOWN ALL { int lchown(user_addr_t path, uid_t owner, gid_t group); } -365 AUE_STACKSNAPSHOT ALL { int stack_snapshot(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t options) NO_SYSCALL_STUB; } +365 AUE_STACKSNAPSHOT ALL { int stack_snapshot(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, uint32_t dispatch_offset) NO_SYSCALL_STUB; } #if CONFIG_WORKQUEUE 366 AUE_NULL ALL { int bsdthread_register(user_addr_t threadstart, user_addr_t wqthread, int pthsize,user_addr_t dummy_value, user_addr_t targetconc_ptr, uint64_t dispatchqueue_offset) NO_SYSCALL_STUB; } 367 AUE_WORKQOPEN ALL { int workq_open(void) NO_SYSCALL_STUB; } diff --git a/bsd/kern/uipc_usrreq.c b/bsd/kern/uipc_usrreq.c index 26f38c8f5..202f2d858 100644 --- a/bsd/kern/uipc_usrreq.c +++ b/bsd/kern/uipc_usrreq.c @@ -780,6 +780,8 @@ unp_attach(struct socket *so) static void unp_detach(struct unpcb *unp) { + int so_locked = 1; + lck_rw_lock_exclusive(unp_list_mtx); LIST_REMOVE(unp, unp_link); lck_rw_done(unp_list_mtx); @@ -805,13 +807,46 @@ unp_detach(struct unpcb *unp) if (unp->unp_conn) unp_disconnect(unp); while (unp->unp_refs.lh_first) { - struct unpcb *unp2 = unp->unp_refs.lh_first; - socket_unlock(unp->unp_socket, 0); - - socket_lock(unp2->unp_socket, 1); - unp_drop(unp2, ECONNRESET); - socket_unlock(unp2->unp_socket, 1); + struct unpcb *unp2 = NULL; + + /* This datagram socket is connected to one or more + * sockets. 
In order to avoid a race condition between removing + * this reference and closing the connected socket, we need + * to check disconnect_in_progress + */ + if (so_locked == 1) { + socket_unlock(unp->unp_socket, 0); + so_locked = 0; + } + lck_mtx_lock(unp_disconnect_lock); + while (disconnect_in_progress != 0) { + (void)msleep((caddr_t)&disconnect_in_progress, unp_disconnect_lock, + PSOCK, "disconnect", NULL); + } + disconnect_in_progress = 1; + lck_mtx_unlock(unp_disconnect_lock); + + /* Now we are sure that any unpcb socket disconnect is not happening */ + if (unp->unp_refs.lh_first != NULL) { + unp2 = unp->unp_refs.lh_first; + socket_lock(unp2->unp_socket, 1); + } + + lck_mtx_lock(unp_disconnect_lock); + disconnect_in_progress = 0; + wakeup(&disconnect_in_progress); + lck_mtx_unlock(unp_disconnect_lock); + + if (unp2 != NULL) { + /* We already locked this socket and have a reference on it */ + unp_drop(unp2, ECONNRESET); + socket_unlock(unp2->unp_socket, 1); + } + } + + if (so_locked == 0) { socket_lock(unp->unp_socket, 0); + so_locked = 1; } soisdisconnected(unp->unp_socket); /* makes sure we're getting dealloced */ @@ -1160,9 +1195,7 @@ unp_connect2(struct socket *so, struct socket *so2) switch (so->so_type) { case SOCK_DGRAM: - lck_rw_lock_exclusive(unp_list_mtx); LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); - lck_rw_done(unp_list_mtx); /* Avoid lock order reversals due to drop/acquire in soisconnected. 
*/ @@ -1292,9 +1325,7 @@ unp_disconnect(struct unpcb *unp) switch (unp->unp_socket->so_type) { case SOCK_DGRAM: - lck_rw_lock_exclusive(unp_list_mtx); LIST_REMOVE(unp, unp_reflink); - lck_rw_done(unp_list_mtx); unp->unp_socket->so_state &= ~SS_ISCONNECTED; socket_unlock(so2, 1); break; diff --git a/bsd/net/Makefile b/bsd/net/Makefile index 920fbe064..1ea89d1cc 100644 --- a/bsd/net/Makefile +++ b/bsd/net/Makefile @@ -34,7 +34,8 @@ KERNELFILES= \ PRIVATE_DATAFILES = \ if_atm.h if_vlan_var.h if_ppp.h firewire.h \ ppp_defs.h radix.h if_bond_var.h lacp.h ndrv_var.h \ - raw_cb.h etherdefs.h iso88025.h if_pflog.h pfvar.h + raw_cb.h etherdefs.h iso88025.h if_pflog.h pfvar.h \ + if_bridgevar.h PRIVATE_KERNELFILES = ${KERNELFILES} \ bpfdesc.h dlil_pvt.h ppp_comp.h \ diff --git a/bsd/net/bridge.c b/bsd/net/bridge.c deleted file mode 100644 index 01d3cb7f5..000000000 --- a/bsd/net/bridge.c +++ /dev/null @@ -1,906 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1998 Luigi Rizzo - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD: src/sys/net/bridge.c,v 1.16.2.14 2001/02/09 23:13:41 luigi Exp $ - */ - -/* - * This code implements bridging in FreeBSD. 
It only acts on ethernet - * type of interfaces (others are still usable for routing). - * A bridging table holds the source MAC address/dest. interface for each - * known node. The table is indexed using an hash of the source address. - * - * Input packets are tapped near the beginning of ether_input(), and - * analysed by calling bridge_in(). Depending on the result, the packet - * can be forwarded to one or more output interfaces using bdg_forward(), - * and/or sent to the upper layer (e.g. in case of multicast). - * - * Output packets are intercepted near the end of ether_output(), - * the correct destination is selected calling bridge_dst_lookup(), - * and then forwarding is done using bdg_forward(). - * Bridging is controlled by the sysctl variable net.link.ether.bridge - * - * The arp code is also modified to let a machine answer to requests - * irrespective of the port the request came from. - * - * In case of loops in the bridging topology, the bridge detects this - * event and temporarily mutes output bridging on one of the ports. - * Periodically, interfaces are unmuted by bdg_timeout(). - * Muting is only implemented as a safety measure, and also as - * a mechanism to support a user-space implementation of the spanning - * tree algorithm. In the final release, unmuting will only occur - * because of explicit action of the user-level daemon. - * - * To build a bridging kernel, use the following option - * option BRIDGE - * and then at runtime set the sysctl variable to enable bridging. - * - * Only one interface is supposed to have addresses set (but - * there are no problems in practice if you set addresses for more - * than one interface). - * Bridging will act before routing, but nothing prevents a machine - * from doing both (modulo bugs in the implementation...). - * - * THINGS TO REMEMBER - * - bridging is incompatible with multicast routing on the same - * machine. There is not an easy fix to this. - * - loop detection is still not very robust. 
- * - the interface of bdg_forward() could be improved. - */ - -#include -#include -#include -#include -#include /* for net/if.h */ -#include -#include - -#include -#include - -#include /* for struct arpcom */ -#include -#include -#include -#include /* for struct arpcom */ - -#include "opt_ipfw.h" -#include "opt_ipdn.h" - -#if defined(IPFIREWALL) -#include -#include -#if defined(DUMMYNET) -#include -#endif -#endif - -#include - -/* - * For debugging, you can use the following macros. - * remember, rdtsc() only works on Pentium-class machines - - quad_t ticks; - DDB(ticks = rdtsc();) - ... interesting code ... - DDB(bdg_fw_ticks += (u_int32_t)(rdtsc() - ticks) ; bdg_fw_count++ ;) - - * - */ - -#define DDB(x) x -#define DEB(x) - -static void bdginit(void *); -static void bdgtakeifaces(void); -static void flush_table(void); -static void bdg_promisc_on(void); -static void parse_bdg_cfg(void); - -static int bdg_ipfw = 0 ; -int do_bridge = 0; -bdg_hash_table *bdg_table = NULL ; - -/* - * System initialization - */ - -SYSINIT(interfaces, SI_SUB_PROTO_IF, SI_ORDER_FIRST, bdginit, NULL) - -static struct bdg_stats bdg_stats ; -struct bdg_softc *ifp2sc = NULL ; -/* XXX make it static of size BDG_MAX_PORTS */ - -#define IFP_CHK(ifp, x) \ - if (ifp2sc[ifp->if_index].magic != 0xDEADBEEF) { x ; } - -/* - * turn off promisc mode, optionally clear the IFF_USED flag. 
- * The flag is turned on by parse_bdg_config - */ -static void -bdg_promisc_off(int clear_used) -{ - struct ifnet *ifp ; - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet_head, if_link) { - if ( (ifp2sc[ifp->if_index].flags & IFF_BDG_PROMISC) ) { - int s, ret ; - s = splimp(); - ret = ifnet_set_promiscuous(ifp, 0); - splx(s); - ifp2sc[ifp->if_index].flags &= ~(IFF_BDG_PROMISC|IFF_MUTE) ; - DEB(printf(">> now %s%d promisc OFF if_flags 0x%x bdg_flags 0x%x\n", - ifp->if_name, ifp->if_unit, - ifp->if_flags, ifp2sc[ifp->if_index].flags);) - } - if (clear_used) { - ifp2sc[ifp->if_index].flags &= ~(IFF_USED) ; - bdg_stats.s[ifp->if_index].name[0] = '\0'; - } - } - ifnet_head_done(); -} - -/* - * set promisc mode on the interfaces we use. - */ -static void -bdg_promisc_on() -{ - struct ifnet *ifp ; - int s ; - - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet_head, if_link) { - if ( !BDG_USED(ifp) ) - continue ; - if ( 0 == ( ifp->if_flags & IFF_UP) ) { - s = splimp(); - if_up(ifp); - splx(s); - } - if ( !(ifp2sc[ifp->if_index].flags & IFF_BDG_PROMISC) ) { - int ret ; - s = splimp(); - ret = ifnet_set_promiscuous(ifp, 1); - splx(s); - ifp2sc[ifp->if_index].flags |= IFF_BDG_PROMISC ; - printf(">> now %s%d promisc ON if_flags 0x%x bdg_flags 0x%x\n", - ifp->if_name, ifp->if_unit, - ifp->if_flags, ifp2sc[ifp->if_index].flags); - } - if (BDG_MUTED(ifp)) { - printf(">> unmuting %s%d\n", ifp->if_name, ifp->if_unit); - BDG_UNMUTE(ifp) ; - } - } - ifnet_head_done(); -} - -static int -sysctl_bdg(SYSCTL_HANDLER_ARGS) -{ - int error, oldval = do_bridge ; - - error = sysctl_handle_int(oidp, - oidp->oid_arg1, oidp->oid_arg2, req); - DEB( printf("called sysctl for bridge name %s arg2 %d val %d->%d\n", - oidp->oid_name, oidp->oid_arg2, - oldval, do_bridge); ) - - if (bdg_table == NULL) - do_bridge = 0 ; - if (oldval != do_bridge) { - bdg_promisc_off( 1 ); /* reset previously used interfaces */ - flush_table(); - if (do_bridge) { - parse_bdg_cfg(); - bdg_promisc_on(); - } - } 
- return error ; -} - -static char bridge_cfg[256] = { "" } ; - -/* - * parse the config string, set IFF_USED, name and cluster_id - * for all interfaces found. - */ -static void -parse_bdg_cfg() -{ - char *p, *beg ; - int i, l, cluster; - struct bdg_softc *b; - - for (p= bridge_cfg; *p ; p++) { - /* interface names begin with [a-z] and continue up to ':' */ - if (*p < 'a' || *p > 'z') - continue ; - for ( beg = p ; *p && *p != ':' ; p++ ) - ; - if (*p == 0) /* end of string, ':' not found */ - return ; - l = p - beg ; /* length of name string */ - p++ ; - DEB(printf("-- match beg(%d) <%s> p <%s>\n", l, beg, p);) - for (cluster = 0 ; *p && *p >= '0' && *p <= '9' ; p++) - cluster = cluster*10 + (*p -'0'); - /* - * now search in bridge strings - */ - for (i=0, b = ifp2sc ; i < if_index ; i++, b++) { - char buf[32]; - struct ifnet *ifp = b->ifp ; - - if (ifp == NULL) - continue; - sprintf(buf, "%s%d", ifp->if_name, ifp->if_unit); - if (!strncmp(beg, buf, l)) { /* XXX not correct for >10 if! */ - b->cluster_id = htons(cluster) ; - b->flags |= IFF_USED ; - sprintf(bdg_stats.s[ifp->if_index].name, - "%s%d:%d", ifp->if_name, ifp->if_unit, cluster); - - DEB(printf("--++ found %s\n", - bdg_stats.s[ifp->if_index].name);) - break ; - } - } - if (*p == '\0') - break ; - } -} - -static int -sysctl_bdg_cfg(SYSCTL_HANDLER_ARGS) -{ - int error = 0 ; - char oldval[256] ; - - strlcpy(oldval, bridge_cfg, sizeof (oldval)); - - error = sysctl_handle_string(oidp, - bridge_cfg, oidp->oid_arg2, req); - DEB( - printf("called sysctl for bridge name %s arg2 %d err %d val %s->%s\n", - oidp->oid_name, oidp->oid_arg2, - error, - oldval, bridge_cfg); - ) - if (strcmp(oldval, bridge_cfg)) { - bdg_promisc_off( 1 ); /* reset previously-used interfaces */ - flush_table(); - parse_bdg_cfg(); /* and set new ones... 
*/ - if (do_bridge) - bdg_promisc_on(); /* re-enable interfaces */ - } - return error ; -} - -static int -sysctl_refresh(SYSCTL_HANDLER_ARGS) -{ - if (req->newptr) - bdgtakeifaces(); - - return 0; -} - - -SYSCTL_DECL(_net_link_ether); -SYSCTL_PROC(_net_link_ether, OID_AUTO, bridge_cfg, CTLTYPE_STRING|CTLFLAG_RW, - &bridge_cfg, sizeof(bridge_cfg), &sysctl_bdg_cfg, "A", - "Bridge configuration"); - -SYSCTL_PROC(_net_link_ether, OID_AUTO, bridge, CTLTYPE_INT|CTLFLAG_RW, - &do_bridge, 0, &sysctl_bdg, "I", "Bridging"); - -SYSCTL_INT(_net_link_ether, OID_AUTO, bridge_ipfw, CTLFLAG_RW, - &bdg_ipfw,0,"Pass bridged pkts through firewall"); - -#define SY(parent, var, comment) \ - static int var ; \ - SYSCTL_INT(parent, OID_AUTO, var, CTLFLAG_RW, &(var), 0, comment); - -int bdg_ipfw_drops; -SYSCTL_INT(_net_link_ether, OID_AUTO, bridge_ipfw_drop, - CTLFLAG_RW, &bdg_ipfw_drops,0,""); - -int bdg_ipfw_colls; -SYSCTL_INT(_net_link_ether, OID_AUTO, bridge_ipfw_collisions, - CTLFLAG_RW, &bdg_ipfw_colls,0,""); - -SYSCTL_PROC(_net_link_ether, OID_AUTO, bridge_refresh, CTLTYPE_INT|CTLFLAG_WR, - NULL, 0, &sysctl_refresh, "I", "iface refresh"); - -#if 1 /* diagnostic vars */ - -SY(_net_link_ether, verbose, "Be verbose"); -SY(_net_link_ether, bdg_split_pkts, "Packets split in bdg_forward"); - -SY(_net_link_ether, bdg_thru, "Packets through bridge"); - -SY(_net_link_ether, bdg_copied, "Packets copied in bdg_forward"); - -SY(_net_link_ether, bdg_copy, "Force copy in bdg_forward"); -SY(_net_link_ether, bdg_predict, "Correctly predicted header location"); - -SY(_net_link_ether, bdg_fw_avg, "Cycle counter avg"); -SY(_net_link_ether, bdg_fw_ticks, "Cycle counter item"); -SY(_net_link_ether, bdg_fw_count, "Cycle counter count"); -#endif - -SYSCTL_STRUCT(_net_link_ether, PF_BDG, bdgstats, - CTLFLAG_RD, &bdg_stats , bdg_stats, "bridge statistics"); - -static int bdg_loops ; - -/* - * completely flush the bridge table. 
- */ -static void -flush_table() -{ - int s,i; - - if (bdg_table == NULL) - return ; - s = splimp(); - for (i=0; i< HASH_SIZE; i++) - bdg_table[i].name= NULL; /* clear table */ - splx(s); -} - -/* - * called periodically to flush entries etc. - */ -static void -bdg_timeout(void *dummy) -{ - static int slowtimer = 0 ; - - if (bdg_inted == 0) { - bdg_init2(0); - } else if (do_bridge) { - static int age_index = 0 ; /* index of table position to age */ - int l = age_index + HASH_SIZE/4 ; - /* - * age entries in the forwarding table. - */ - if (l > HASH_SIZE) - l = HASH_SIZE ; - for (; age_index < l ; age_index++) - if (bdg_table[age_index].used) - bdg_table[age_index].used = 0 ; - else if (bdg_table[age_index].name) { - /* printf("xx flushing stale entry %d\n", age_index); */ - bdg_table[age_index].name = NULL ; - } - if (age_index >= HASH_SIZE) - age_index = 0 ; - - if (--slowtimer <= 0 ) { - slowtimer = 5 ; - - bdg_promisc_on() ; /* we just need unmute, really */ - bdg_loops = 0 ; - } - } - timeout(bdg_timeout, (void *)0, 2*hz ); -} - -/* - * local MAC addresses are held in a small array. This makes comparisons - * much faster. - */ -bdg_addr bdg_addresses[BDG_MAX_PORTS]; -int bdg_ports ; - -/* - * initialization of bridge code. This needs to be done after all - * interfaces have been configured. 
- */ - -static int bdg_inited = 0; - -static void -bdg_init2(void) -{ - if (bdg_inited != 0) - return; - - if (bdg_table == NULL) { - bdg_table = (struct hash_table *) - _MALLOC(HASH_SIZE * sizeof(struct hash_table), - M_IFADDR, M_WAITOK); - if (bdg_table == NULL) - return; - - flush_table(); - } - - if (ifp2sc == NULL) { - ifp2sc = _MALLOC(BDG_MAX_PORTS * sizeof(struct bdg_softc), - M_IFADDR, M_WAITOK ); - if (ifp2sc == NULL) - return; - - bzero(ifp2sc, BDG_MAX_PORTS * sizeof(struct bdg_softc) ); - bdgtakeifaces(); - } - - bdg_inited = 1; -} - -static void -bdginit(void *dummy) -{ - /* Initialize first what can't fail */ - bzero(&bdg_stats, sizeof(bdg_stats) ); - do_bridge=0; - - /* Attempt to initialize the rest and start the timer */ - bdg_timeout(0); -} - -void -bdgtakeifaces(void) -{ - int i ; - struct ifnet *ifp; - bdg_addr *p = bdg_addresses ; - struct bdg_softc *bp; - - bdg_ports = 0 ; - *bridge_cfg = '\0'; - - printf("BRIDGE 010131, have %d interfaces\n", if_index); - ifnet_head_lock_shared(); - for (i = 0 , ifp = ifnet.tqh_first ; i < if_index ; - i++, ifp = TAILQ_NEXT(ifp, if_link) ) - if (ifp->if_type == IFT_ETHER) { /* ethernet ? */ - ifnet_lladdr_copy_bytes(ifp, p->etheraddr, ETHER_ADDR_LEN); - bp = &ifp2sc[ifp->if_index] ; - sprintf(bridge_cfg + strlen(bridge_cfg), - "%s%d:1,", ifp->if_name, ifp->if_unit); - printf("-- index %d %s type %d phy %d addrl %d addr %6D\n", - ifp->if_index, - bdg_stats.s[ifp->if_index].name, - (int)ifp->if_type, (int) ifp->if_physical, - (int)ifp->if_addrlen, - p->etheraddr, "." ); - p++ ; - bp->ifp = ifp ; - bp->flags = IFF_USED ; - bp->cluster_id = htons(1) ; - bp->magic = 0xDEADBEEF ; - - sprintf(bdg_stats.s[ifp->if_index].name, - "%s%d:%d", ifp->if_name, ifp->if_unit, - ntohs(bp->cluster_id)); - bdg_ports ++ ; - } - ifnet_head_done(); -} - -/* - * bridge_in() is invoked to perform bridging decision on input packets. - * - * On Input: - * eh Ethernet header of the incoming packet. 
- * - * On Return: destination of packet, one of - * BDG_BCAST broadcast - * BDG_MCAST multicast - * BDG_LOCAL is only for a local address (do not forward) - * BDG_DROP drop the packet - * ifp ifp of the destination interface. - * - * Forwarding is not done directly to give a chance to some drivers - * to fetch more of the packet, or simply drop it completely. - */ - -struct ifnet * -bridge_in(struct ifnet *ifp, struct ether_header *eh) -{ - int index; - struct ifnet *dst , *old ; - int dropit = BDG_MUTED(ifp) ; - - /* - * hash the source address - */ - index= HASH_FN(eh->ether_shost); - bdg_table[index].used = 1 ; - old = bdg_table[index].name ; - if ( old ) { /* the entry is valid. */ - IFP_CHK(old, printf("bridge_in-- reading table\n") ); - - if (!BDG_MATCH( eh->ether_shost, bdg_table[index].etheraddr) ) { - bdg_ipfw_colls++ ; - bdg_table[index].name = NULL ; - } else if (old != ifp) { - /* - * found a loop. Either a machine has moved, or there - * is a misconfiguration/reconfiguration of the network. - * First, do not forward this packet! - * Record the relocation anyways; then, if loops persist, - * suspect a reconfiguration and disable forwarding - * from the old interface. - */ - bdg_table[index].name = ifp ; /* relocate address */ - printf("-- loop (%d) %6D to %s%d from %s%d (%s)\n", - bdg_loops, eh->ether_shost, ".", - ifp->if_name, ifp->if_unit, - old->if_name, old->if_unit, - BDG_MUTED(old) ? "muted":"active"); - dropit = 1 ; - if ( !BDG_MUTED(old) ) { - if (++bdg_loops > 10) - BDG_MUTE(old) ; - } - } - } - - /* - * now write the source address into the table - */ - if (bdg_table[index].name == NULL) { - DEB(printf("new addr %6D at %d for %s%d\n", - eh->ether_shost, ".", index, ifp->if_name, ifp->if_unit);) - bcopy(eh->ether_shost, bdg_table[index].etheraddr, 6); - bdg_table[index].name = ifp ; - } - dst = bridge_dst_lookup(eh); - /* Return values: - * BDG_BCAST, BDG_MCAST, BDG_LOCAL, BDG_UNKNOWN, BDG_DROP, ifp. 
- * For muted interfaces, the first 3 are changed in BDG_LOCAL, - * and others to BDG_DROP. Also, for incoming packets, ifp is changed - * to BDG_DROP in case ifp == src . These mods are not necessary - * for outgoing packets from ether_output(). - */ - BDG_STAT(ifp, BDG_IN); - switch ((int)dst) { - case (int)BDG_BCAST: - case (int)BDG_MCAST: - case (int)BDG_LOCAL: - case (int)BDG_UNKNOWN: - case (int)BDG_DROP: - BDG_STAT(ifp, dst); - break ; - default : - if (dst == ifp || dropit ) - BDG_STAT(ifp, BDG_DROP); - else - BDG_STAT(ifp, BDG_FORWARD); - break ; - } - - if ( dropit ) { - if (dst == BDG_BCAST || dst == BDG_MCAST || dst == BDG_LOCAL) - return BDG_LOCAL ; - else - return BDG_DROP ; - } else { - return (dst == ifp ? BDG_DROP : dst ) ; - } -} - -/* - * Forward to dst, excluding src port and muted interfaces. - * If src == NULL, the pkt comes from ether_output, and dst is the real - * interface the packet is originally sent to. In this case we must forward - * it to the whole cluster. We never call bdg_forward ether_output on - * interfaces which are not part of a cluster. - * - * The packet is freed if possible (i.e. surely not of interest for - * the upper layer), otherwise a copy is left for use by the caller - * (pointer in m0). - * - * It would be more efficient to make bdg_forward() always consume - * the packet, leaving to the caller the task to check if it needs a copy - * and get one in case. As it is now, bdg_forward() can sometimes make - * a copy whereas it is not necessary. 
- * - * XXX be careful about eh, it can be a pointer into *m - */ -struct mbuf * -bdg_forward(struct mbuf *m0, struct ether_header *const eh, struct ifnet *dst) -{ - struct ifnet *src = m0->m_pkthdr.rcvif; /* could be NULL in output */ - struct ifnet *ifp, *last = NULL ; - int s ; - int shared = bdg_copy ; /* someone else is using the mbuf */ - int once = 0; /* loop only once */ - struct ifnet *real_dst = dst ; /* real dst from ether_output */ -#ifdef IPFIREWALL - struct ip_fw_chain *rule = NULL ; /* did we match a firewall rule ? */ -#endif - - /* - * XXX eh is usually a pointer within the mbuf (some ethernet drivers - * do that), so we better copy it before doing anything with the mbuf, - * or we might corrupt the header. - */ - struct ether_header save_eh = *eh ; - -#if defined(IPFIREWALL) && defined(DUMMYNET) - if (m0->m_type == MT_DUMMYNET) { - /* extract info from dummynet header */ - rule = (struct ip_fw_chain *)(m0->m_data) ; - m0 = m0->m_next ; - src = m0->m_pkthdr.rcvif; - shared = 0 ; /* For sure this is our own mbuf. */ - } else -#endif - bdg_thru++; /* only count once */ - - if (src == NULL) /* packet from ether_output */ - dst = bridge_dst_lookup(eh); - if (dst == BDG_DROP) { /* this should not happen */ - printf("xx bdg_forward for BDG_DROP\n"); - m_freem(m0); - return NULL; - } - if (dst == BDG_LOCAL) { /* this should not happen as well */ - printf("xx ouch, bdg_forward for local pkt\n"); - return m0; - } - if (dst == BDG_BCAST || dst == BDG_MCAST || dst == BDG_UNKNOWN) { - ifp = ifnet_head.tqh_first ; /* scan all ports */ - once = 0 ; - if (dst != BDG_UNKNOWN) /* need a copy for the local stack */ - shared = 1 ; - } else { - ifp = dst ; - once = 1 ; - } - if ( (u_int)(ifp) <= (u_int)BDG_FORWARD ) - panic("bdg_forward: bad dst"); - -#ifdef IPFIREWALL - /* - * Do filtering in a very similar way to what is done in ip_output. - * Only if firewall is loaded, enabled, and the packet is not - * from ether_output() (src==NULL, or we would filter it twice). 
- * Additional restrictions may apply e.g. non-IP, short packets, - * and pkts already gone through a pipe. - */ - if (ip_fw_chk_ptr && bdg_ipfw != 0 && src != NULL) { - struct ip *ip ; - int i; - - if (rule != NULL) /* dummynet packet, already partially processed */ - goto forward; /* HACK! I should obey the fw_one_pass */ - if (ntohs(save_eh.ether_type) != ETHERTYPE_IP) - goto forward ; /* not an IP packet, ipfw is not appropriate */ - if (m0->m_pkthdr.len < sizeof(struct ip) ) - goto forward ; /* header too short for an IP pkt, cannot filter */ - /* - * i need some amt of data to be contiguous, and in case others need - * the packet (shared==1) also better be in the first mbuf. - */ - i = min(m0->m_pkthdr.len, max_protohdr) ; - if ( shared || m0->m_len < i) { - m0 = m_pullup(m0, i) ; - if (m0 == NULL) { - printf("-- bdg: pullup failed.\n") ; - return NULL ; - } - } - - /* - * before calling the firewall, swap fields the same as IP does. - * here we assume the pkt is an IP one and the header is contiguous - */ - ip = mtod(m0, struct ip *); - NTOHS(ip->ip_len); - NTOHS(ip->ip_off); - - /* - * The third parameter to the firewall code is the dst. interface. - * Since we apply checks only on input pkts we use NULL. - * The firewall knows this is a bridged packet as the cookie ptr - * is NULL. - */ - i = (*ip_fw_chk_ptr)(&ip, 0, NULL, NULL /* cookie */, &m0, &rule, NULL); - if ( (i & IP_FW_PORT_DENY_FLAG) || m0 == NULL) /* drop */ - return m0 ; - /* - * If we get here, the firewall has passed the pkt, but the mbuf - * pointer might have changed. Restore ip and the fields NTOHS()'d. - */ - ip = mtod(m0, struct ip *); - HTONS(ip->ip_len); - HTONS(ip->ip_off); - - if (i == 0) /* a PASS rule. */ - goto forward ; -#ifdef DUMMYNET - if (i & IP_FW_PORT_DYNT_FLAG) { - /* - * Pass the pkt to dummynet, which consumes it. - * If shared, make a copy and keep the original. 
- * Need to prepend the ethernet header, optimize the common - * case of eh pointing already into the original mbuf. - */ - struct mbuf *m ; - if (shared) { - m = m_copypacket(m0, M_DONTWAIT); - if (m == NULL) { - printf("bdg_fwd: copy(1) failed\n"); - return m0; - } - } else { - m = m0 ; /* pass the original to dummynet */ - m0 = NULL ; /* and nothing back to the caller */ - } - if ( (void *)(eh + 1) == (void *)m->m_data) { - m->m_data -= ETHER_HDR_LEN ; - m->m_len += ETHER_HDR_LEN ; - m->m_pkthdr.len += ETHER_HDR_LEN ; - bdg_predict++; - } else { - M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT); - if (!m && verbose) printf("M_PREPEND failed\n"); - if (m == NULL) /* nope... */ - return m0 ; - bcopy(&save_eh, mtod(m, struct ether_header *), ETHER_HDR_LEN); - } - dummynet_io((i & 0xffff),DN_TO_BDG_FWD,m,real_dst,NULL,0,rule,0); - return m0 ; - } -#endif - /* - * XXX add divert/forward actions... - */ - /* if none of the above matches, we have to drop the pkt */ - bdg_ipfw_drops++ ; - printf("bdg_forward: No rules match, so dropping packet!\n"); - return m0 ; - } -forward: -#endif /* IPFIREWALL */ - /* - * Again, bring up the headers in case of shared bufs to avoid - * corruptions in the future. - */ - if ( shared ) { - int i = min(m0->m_pkthdr.len, max_protohdr) ; - - m0 = m_pullup(m0, i) ; - if (m0 == NULL) { - printf("-- bdg: pullup2 failed.\n") ; - return NULL ; - } - } - /* now real_dst is used to determine the cluster where to forward */ - if (src != NULL) /* pkt comes from ether_input */ - real_dst = src ; - for (;;) { - if (last) { /* need to forward packet leftover from previous loop */ - struct mbuf *m ; - if (shared == 0 && once ) { /* no need to copy */ - m = m0 ; - m0 = NULL ; /* original is gone */ - } else { - m = m_copypacket(m0, M_DONTWAIT); - if (m == NULL) { - printf("bdg_forward: sorry, m_copypacket failed!\n"); - return m0 ; /* the original is still there... 
*/ - } - } - /* - * Add header (optimized for the common case of eh pointing - * already into the mbuf) and execute last part of ether_output: - * queue pkt and start output if interface not yet active. - */ - if ( (void *)(eh + 1) == (void *)m->m_data) { - m->m_data -= ETHER_HDR_LEN ; - m->m_len += ETHER_HDR_LEN ; - m->m_pkthdr.len += ETHER_HDR_LEN ; - bdg_predict++; - } else { - M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT); - if (!m && verbose) printf("M_PREPEND failed\n"); - if (m == NULL) - return m0; - bcopy(&save_eh, mtod(m, struct ether_header *), ETHER_HDR_LEN); - } - s = splimp(); - if (IF_QFULL(&last->if_snd)) { - IF_DROP(&last->if_snd); -#if 0 - BDG_MUTE(last); /* should I also mute ? */ -#endif - splx(s); - m_freem(m); /* consume the pkt anyways */ - } else { - last->if_obytes += m->m_pkthdr.len ; - if (m->m_flags & M_MCAST) - last->if_omcasts++; - if (m->m_pkthdr.len != m->m_len) /* this pkt is on >1 bufs */ - bdg_split_pkts++; - - IF_ENQUEUE(&last->if_snd, m); - if ((last->if_flags & IFF_OACTIVE) == 0) - (*last->if_start)(last); - splx(s); - } - BDG_STAT(last, BDG_OUT); - last = NULL ; - if (once) - break ; - } - if (ifp == NULL) - break ; - /* - * If the interface is used for bridging, not muted, not full, - * up and running, is not the source interface, and belongs to - * the same cluster as the 'real_dst', then send here. - */ - if ( BDG_USED(ifp) && !BDG_MUTED(ifp) && !IF_QFULL(&ifp->if_snd) && - (ifp->if_flags & (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING) && - ifp != src && BDG_SAMECLUSTER(ifp, real_dst) ) - last = ifp ; - ifp = TAILQ_NEXT(ifp, if_link) ; - if (ifp == NULL) - once = 1 ; - } - DEB(bdg_fw_ticks += (u_int32_t)(rdtsc() - ticks) ; bdg_fw_count++ ; - if (bdg_fw_count != 0) bdg_fw_avg = bdg_fw_ticks/bdg_fw_count; ) - return m0 ; -} diff --git a/bsd/net/bridge.h b/bsd/net/bridge.h deleted file mode 100644 index faeff4283..000000000 --- a/bsd/net/bridge.h +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. 
All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1998 Luigi Rizzo - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - */ -#ifndef _NET_BRIDGE_H_ -#define _NET_BRIDGE_H_ -#include - -#warning This is not used by Darwin, do not include - -extern int do_bridge; -/* - * the hash table for bridge - */ -typedef struct hash_table { - struct ifnet *name ; - unsigned char etheraddr[6] ; - unsigned short used ; -} bdg_hash_table ; - -extern bdg_hash_table *bdg_table ; - -/* - * We need additional info for the bridge. The bdg_ifp2sc[] array - * provides a pointer to this struct using the if_index. - * bdg_softc has a backpointer to the struct ifnet, the bridge - * flags, and a cluster (bridging occurs only between port of the - * same cluster). - */ -struct bdg_softc { - struct ifnet *ifp ; - /* also ((struct arpcom *)ifp)->ac_enaddr is the eth. addr */ - int flags ; -#define IFF_BDG_PROMISC 0x0001 /* set promisc mode on this if. */ -#define IFF_MUTE 0x0002 /* mute this if for bridging. */ -#define IFF_USED 0x0004 /* use this if for bridging. 
*/ - short cluster_id ; /* in network format */ - uint32_t magic; -} ; - -extern struct bdg_softc *ifp2sc; - -#define BDG_USED(ifp) (ifp2sc[ifp->if_index].flags & IFF_USED) -#define BDG_MUTED(ifp) (ifp2sc[ifp->if_index].flags & IFF_MUTE) -#define BDG_MUTE(ifp) ifp2sc[ifp->if_index].flags |= IFF_MUTE -#define BDG_UNMUTE(ifp) ifp2sc[ifp->if_index].flags &= ~IFF_MUTE -#define BDG_CLUSTER(ifp) (ifp2sc[ifp->if_index].cluster_id) - -#define BDG_SAMECLUSTER(ifp,src) \ - (src == NULL || BDG_CLUSTER(ifp) == BDG_CLUSTER(src) ) - - -#define BDG_MAX_PORTS 128 -typedef struct _bdg_addr { - unsigned char etheraddr[6] ; - short cluster_id ; -} bdg_addr ; -extern bdg_addr bdg_addresses[BDG_MAX_PORTS]; -extern int bdg_ports ; - -/* - * out of the 6 bytes, the last ones are more "variable". Since - * we are on a little endian machine, we have to do some gimmick... - */ -#define HASH_SIZE 8192 /* must be a power of 2 */ -#define HASH_FN(addr) ( \ - ntohs( ((short *)addr)[1] ^ ((short *)addr)[2] ) & (HASH_SIZE -1)) - -#define IFF_MUTE IFF_LINK2 /* will need a separate flag... */ - -struct ifnet *bridge_in(struct ifnet *ifp, struct ether_header *eh); -/* bdg_forward frees the mbuf if necessary, returning null */ -struct mbuf *bdg_forward(struct mbuf *m0, struct ether_header *eh, struct ifnet *dst); - -#ifdef __i386__ -#define BDG_MATCH(a,b) ( \ - ((unsigned short *)(a))[2] == ((unsigned short *)(b))[2] && \ - *((unsigned int *)(a)) == *((unsigned int *)(b)) ) -#define IS_ETHER_BROADCAST(a) ( \ - *((unsigned int *)(a)) == 0xffffffff && \ - ((unsigned short *)(a))[2] == 0xffff ) -#else -#warning... must complete these for the alpha etc. -#define BDG_MATCH(a,b) (!bcmp(a, b, ETHER_ADDR_LEN) ) -#endif -/* - * The following constants are not legal ifnet pointers, and are used - * as return values from the classifier, bridge_dst_lookup() - * The same values are used as index in the statistics arrays, - * with BDG_FORWARD replacing specifically forwarded packets. 
- */ -#define BDG_BCAST ( (struct ifnet *)1 ) -#define BDG_MCAST ( (struct ifnet *)2 ) -#define BDG_LOCAL ( (struct ifnet *)3 ) -#define BDG_DROP ( (struct ifnet *)4 ) -#define BDG_UNKNOWN ( (struct ifnet *)5 ) -#define BDG_IN ( (struct ifnet *)7 ) -#define BDG_OUT ( (struct ifnet *)8 ) -#define BDG_FORWARD ( (struct ifnet *)9 ) - -#define PF_BDG 3 /* XXX superhack */ -/* - * statistics, passed up with sysctl interface and ns -p bdg - */ - -#define STAT_MAX (int)BDG_FORWARD -struct bdg_port_stat { - char name[16]; - uint32_t collisions; - uint32_t p_in[STAT_MAX+1]; -} ; - -struct bdg_stats { - struct bdg_port_stat s[16]; -} ; - - -#define BDG_STAT(ifp, type) bdg_stats.s[ifp->if_index].p_in[(int)type]++ - -#ifdef KERNEL -/* - * Find the right pkt destination: - * BDG_BCAST is a broadcast - * BDG_MCAST is a multicast - * BDG_LOCAL is for a local address - * BDG_DROP must be dropped - * other ifp of the dest. interface (incl.self) - * - * We assume this is only called for interfaces for which bridging - * is enabled, i.e. BDG_USED(ifp) is true. - */ -static __inline -struct ifnet * -bridge_dst_lookup(struct ether_header *eh) -{ - struct ifnet *dst ; - int index ; - bdg_addr *p ; - - if (IS_ETHER_BROADCAST(eh->ether_dhost)) - return BDG_BCAST ; - if (eh->ether_dhost[0] & 1) - return BDG_MCAST ; - /* - * Lookup local addresses in case one matches. 
- */ - for (index = bdg_ports, p = bdg_addresses ; index ; index--, p++ ) - if (BDG_MATCH(p->etheraddr, eh->ether_dhost) ) - return BDG_LOCAL ; - /* - * Look for a possible destination in table - */ - index= HASH_FN( eh->ether_dhost ); - dst = bdg_table[index].name; - if ( dst && BDG_MATCH( bdg_table[index].etheraddr, eh->ether_dhost) ) - return dst ; - else - return BDG_UNKNOWN ; -} - -#endif /* KERNEL */ - -#endif /* _NET_BRIDGE_H_ */ diff --git a/bsd/net/bridgestp.c b/bsd/net/bridgestp.c new file mode 100644 index 000000000..1c895828e --- /dev/null +++ b/bsd/net/bridgestp.c @@ -0,0 +1,1153 @@ +/* + * Copyright (c) 2007-2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $fpwf: Revision 1.2 2007/05/17 03:38:46 rnewberry Exp $ */ +/* $NetBSD: bridgestp.c,v 1.10 2006/11/16 01:33:40 christos Exp $ */ + +/* + * Copyright (c) 2000 Jason L. Wright (jason@thought.net) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Jason L. Wright + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * OpenBSD: bridgestp.c,v 1.5 2001/03/22 03:48:29 jason Exp + */ + +/* + * Implementation of the spanning tree protocol as defined in + * ISO/IEC Final DIS 15802-3 (IEEE P802.1D/D17), May 25, 1998. + * (In English: IEEE 802.1D, Draft 17, 1998) + */ + +/* $NetBSD: if_bridgevar.h,v 1.8 2005/12/10 23:21:38 elad Exp $ */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include + +/* BPDU message types */ +#define BSTP_MSGTYPE_CFG 0x00 /* Configuration */ +#define BSTP_MSGTYPE_TCN 0x80 /* Topology chg notification */ + +/* BPDU flags */ +#define BSTP_FLAG_TC 0x01 /* Topology change */ +#define BSTP_FLAG_TCA 0x80 /* Topology change ack */ + +#define BSTP_MESSAGE_AGE_INCR (1 * 256) /* in 256ths of a second */ +#define BSTP_TICK_VAL (1 * 256) /* in 256ths of a second */ + +/* + * Because BPDU's do not make nicely aligned structures, two different + * declarations are used: bstp_?bpdu (wire representation, packed) and + * bstp_*_unit (internal, nicely aligned version). 
+ */ + +/* configuration bridge protocol data unit */ +struct bstp_cbpdu { + uint8_t cbu_dsap; /* LLC: destination sap */ + uint8_t cbu_ssap; /* LLC: source sap */ + uint8_t cbu_ctl; /* LLC: control */ + uint16_t cbu_protoid; /* protocol id */ + uint8_t cbu_protover; /* protocol version */ + uint8_t cbu_bpdutype; /* message type */ + uint8_t cbu_flags; /* flags (below) */ + + /* root id */ + uint16_t cbu_rootpri; /* root priority */ + uint8_t cbu_rootaddr[6]; /* root address */ + + uint32_t cbu_rootpathcost; /* root path cost */ + + /* bridge id */ + uint16_t cbu_bridgepri; /* bridge priority */ + uint8_t cbu_bridgeaddr[6]; /* bridge address */ + + uint16_t cbu_portid; /* port id */ + uint16_t cbu_messageage; /* current message age */ + uint16_t cbu_maxage; /* maximum age */ + uint16_t cbu_hellotime; /* hello time */ + uint16_t cbu_forwarddelay; /* forwarding delay */ +} __attribute__((__packed__)); + +/* topology change notification bridge protocol data unit */ +struct bstp_tbpdu { + uint8_t tbu_dsap; /* LLC: destination sap */ + uint8_t tbu_ssap; /* LLC: source sap */ + uint8_t tbu_ctl; /* LLC: control */ + uint16_t tbu_protoid; /* protocol id */ + uint8_t tbu_protover; /* protocol version */ + uint8_t tbu_bpdutype; /* message type */ +} __attribute__((__packed__)); + +const uint8_t bstp_etheraddr[] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; + +void bstp_initialize_port(struct bridge_softc *, struct bridge_iflist *); +void bstp_ifupdstatus(struct bridge_softc *, struct bridge_iflist *); +void bstp_enable_port(struct bridge_softc *, struct bridge_iflist *); +void bstp_disable_port(struct bridge_softc *, struct bridge_iflist *); +void bstp_enable_change_detection(struct bridge_iflist *); +void bstp_disable_change_detection(struct bridge_iflist *); +int bstp_root_bridge(struct bridge_softc *sc); +int bstp_supersedes_port_info(struct bridge_softc *, + struct bridge_iflist *, struct bstp_config_unit *); +int bstp_designated_port(struct bridge_softc *, struct 
bridge_iflist *); +int bstp_designated_for_some_port(struct bridge_softc *); +void bstp_transmit_config(struct bridge_softc *, struct bridge_iflist *); +void bstp_transmit_tcn(struct bridge_softc *); +void bstp_received_config_bpdu(struct bridge_softc *, + struct bridge_iflist *, struct bstp_config_unit *); +void bstp_received_tcn_bpdu(struct bridge_softc *, struct bridge_iflist *, + struct bstp_tcn_unit *); +void bstp_record_config_information(struct bridge_softc *, + struct bridge_iflist *, struct bstp_config_unit *); +void bstp_record_config_timeout_values(struct bridge_softc *, + struct bstp_config_unit *); +void bstp_config_bpdu_generation(struct bridge_softc *); +void bstp_send_config_bpdu(struct bridge_softc *, struct bridge_iflist *, + struct bstp_config_unit *); +void bstp_configuration_update(struct bridge_softc *); +void bstp_root_selection(struct bridge_softc *); +void bstp_designated_port_selection(struct bridge_softc *); +void bstp_become_designated_port(struct bridge_softc *, + struct bridge_iflist *); +void bstp_port_state_selection(struct bridge_softc *); +void bstp_make_forwarding(struct bridge_softc *, struct bridge_iflist *); +void bstp_make_blocking(struct bridge_softc *, struct bridge_iflist *); +void bstp_set_port_state(struct bridge_iflist *, uint8_t); +void bstp_set_bridge_priority(struct bridge_softc *, uint64_t); +void bstp_set_port_priority(struct bridge_softc *, struct bridge_iflist *, + uint16_t); +void bstp_set_path_cost(struct bridge_softc *, struct bridge_iflist *, + uint32_t); +void bstp_topology_change_detection(struct bridge_softc *); +void bstp_topology_change_acknowledged(struct bridge_softc *); +void bstp_acknowledge_topology_change(struct bridge_softc *, + struct bridge_iflist *); + +void bstp_tick(void *); +void bstp_timer_start(struct bridge_timer *, uint16_t); +void bstp_timer_stop(struct bridge_timer *); +int bstp_timer_expired(struct bridge_timer *, uint16_t); + +void bstp_hold_timer_expiry(struct bridge_softc *, struct 
bridge_iflist *); +void bstp_message_age_timer_expiry(struct bridge_softc *, + struct bridge_iflist *); +void bstp_forward_delay_timer_expiry(struct bridge_softc *, + struct bridge_iflist *); +void bstp_topology_change_timer_expiry(struct bridge_softc *); +void bstp_tcn_timer_expiry(struct bridge_softc *); +void bstp_hello_timer_expiry(struct bridge_softc *); + +void +bstp_transmit_config(struct bridge_softc *sc, struct bridge_iflist *bif) +{ + if (bif->bif_hold_timer.active) { + bif->bif_config_pending = 1; + return; + } + + bif->bif_config_bpdu.cu_message_type = BSTP_MSGTYPE_CFG; + bif->bif_config_bpdu.cu_rootid = sc->sc_designated_root; + bif->bif_config_bpdu.cu_root_path_cost = sc->sc_root_path_cost; + bif->bif_config_bpdu.cu_bridge_id = sc->sc_bridge_id; + bif->bif_config_bpdu.cu_port_id = bif->bif_port_id; + + if (bstp_root_bridge(sc)) + bif->bif_config_bpdu.cu_message_age = 0; + else + bif->bif_config_bpdu.cu_message_age = + sc->sc_root_port->bif_message_age_timer.value + + BSTP_MESSAGE_AGE_INCR; + + bif->bif_config_bpdu.cu_max_age = sc->sc_max_age; + bif->bif_config_bpdu.cu_hello_time = sc->sc_hello_time; + bif->bif_config_bpdu.cu_forward_delay = sc->sc_forward_delay; + bif->bif_config_bpdu.cu_topology_change_acknowledgment + = bif->bif_topology_change_acknowledge; + bif->bif_config_bpdu.cu_topology_change = sc->sc_topology_change; + + if (bif->bif_config_bpdu.cu_message_age < sc->sc_max_age) { + bif->bif_topology_change_acknowledge = 0; + bif->bif_config_pending = 0; + bstp_send_config_bpdu(sc, bif, &bif->bif_config_bpdu); + bstp_timer_start(&bif->bif_hold_timer, 0); + } +} + +void +bstp_send_config_bpdu(struct bridge_softc *sc, struct bridge_iflist *bif, + struct bstp_config_unit *cu) +{ + struct ifnet *ifp; + struct mbuf *m; + struct ether_header *eh; + struct bstp_cbpdu bpdu; + + ifp = bif->bif_ifp; + + if ((ifp->if_flags & IFF_RUNNING) == 0) + return; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return; + + eh = mtod(m, struct ether_header 
*); + + m->m_pkthdr.rcvif = ifp; + m->m_pkthdr.len = sizeof(*eh) + sizeof(bpdu); + m->m_len = m->m_pkthdr.len; + + bpdu.cbu_ssap = bpdu.cbu_dsap = LLC_8021D_LSAP; + bpdu.cbu_ctl = LLC_UI; + bpdu.cbu_protoid = htons(0); + bpdu.cbu_protover = 0; + bpdu.cbu_bpdutype = cu->cu_message_type; + bpdu.cbu_flags = (cu->cu_topology_change ? BSTP_FLAG_TC : 0) | + (cu->cu_topology_change_acknowledgment ? BSTP_FLAG_TCA : 0); + + bpdu.cbu_rootpri = htons(cu->cu_rootid >> 48); /* root id: priority in bits 63..48, address bytes in bits 47..0 */ + bpdu.cbu_rootaddr[0] = cu->cu_rootid >> 40; + bpdu.cbu_rootaddr[1] = cu->cu_rootid >> 32; + bpdu.cbu_rootaddr[2] = cu->cu_rootid >> 24; + bpdu.cbu_rootaddr[3] = cu->cu_rootid >> 16; + bpdu.cbu_rootaddr[4] = cu->cu_rootid >> 8; + bpdu.cbu_rootaddr[5] = cu->cu_rootid >> 0; + + bpdu.cbu_rootpathcost = htonl(cu->cu_root_path_cost); + + bpdu.cbu_bridgepri = htons(cu->cu_bridge_id >> 48); /* transmitting bridge's id: taken from cu_bridge_id, not cu_rootid */ + bpdu.cbu_bridgeaddr[0] = cu->cu_bridge_id >> 40; + bpdu.cbu_bridgeaddr[1] = cu->cu_bridge_id >> 32; + bpdu.cbu_bridgeaddr[2] = cu->cu_bridge_id >> 24; + bpdu.cbu_bridgeaddr[3] = cu->cu_bridge_id >> 16; + bpdu.cbu_bridgeaddr[4] = cu->cu_bridge_id >> 8; + bpdu.cbu_bridgeaddr[5] = cu->cu_bridge_id >> 0; + + bpdu.cbu_portid = htons(cu->cu_port_id); + bpdu.cbu_messageage = htons(cu->cu_message_age); + bpdu.cbu_maxage = htons(cu->cu_max_age); + bpdu.cbu_hellotime = htons(cu->cu_hello_time); + bpdu.cbu_forwarddelay = htons(cu->cu_forward_delay); + + memcpy(eh->ether_shost, ifnet_lladdr(ifp), ETHER_ADDR_LEN); + memcpy(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN); + eh->ether_type = htons(sizeof(bpdu)); /* carries the payload length; bstp_input reads this field back as a length */ + + memcpy(mtod(m, caddr_t) + sizeof(*eh), &bpdu, sizeof(bpdu)); + + bridge_enqueue(sc, ifp, m); // APPLE MODIFICATION - no flags param +} + +int +bstp_root_bridge(struct bridge_softc *sc) +{ + return (sc->sc_designated_root == sc->sc_bridge_id); +} + +int +bstp_supersedes_port_info(struct bridge_softc *sc, struct bridge_iflist *bif, + struct bstp_config_unit *cu) +{ + if (cu->cu_rootid < bif->bif_designated_root) + return (1); + if (cu->cu_rootid > 
bif->bif_designated_root) + return (0); + + if (cu->cu_root_path_cost < bif->bif_designated_cost) + return (1); + if (cu->cu_root_path_cost > bif->bif_designated_cost) + return (0); + + if (cu->cu_bridge_id < bif->bif_designated_bridge) + return (1); + if (cu->cu_bridge_id > bif->bif_designated_bridge) + return (0); + + if (sc->sc_bridge_id != cu->cu_bridge_id) + return (1); + if (cu->cu_port_id <= bif->bif_designated_port) + return (1); + return (0); +} + +void +bstp_record_config_information(__unused struct bridge_softc *sc, + struct bridge_iflist *bif, struct bstp_config_unit *cu) +{ + bif->bif_designated_root = cu->cu_rootid; + bif->bif_designated_cost = cu->cu_root_path_cost; + bif->bif_designated_bridge = cu->cu_bridge_id; + bif->bif_designated_port = cu->cu_port_id; + bstp_timer_start(&bif->bif_message_age_timer, cu->cu_message_age); +} + +void +bstp_record_config_timeout_values(struct bridge_softc *sc, + struct bstp_config_unit *config) +{ + sc->sc_max_age = config->cu_max_age; + sc->sc_hello_time = config->cu_hello_time; + sc->sc_forward_delay = config->cu_forward_delay; + sc->sc_topology_change = config->cu_topology_change; +} + +void +bstp_config_bpdu_generation(struct bridge_softc *sc) +{ + struct bridge_iflist *bif; + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if ((bif->bif_flags & IFBIF_STP) == 0) + continue; + if (bstp_designated_port(sc, bif) && + (bif->bif_state != BSTP_IFSTATE_DISABLED)) + bstp_transmit_config(sc, bif); + } +} + +int +bstp_designated_port(struct bridge_softc *sc, struct bridge_iflist *bif) +{ + return ((bif->bif_designated_bridge == sc->sc_bridge_id) + && (bif->bif_designated_port == bif->bif_port_id)); +} + +void +bstp_transmit_tcn(struct bridge_softc *sc) +{ + struct bstp_tbpdu bpdu; + struct bridge_iflist *bif = sc->sc_root_port; + struct ifnet *ifp; + struct ether_header *eh; + struct mbuf *m; + + KASSERT(bif != NULL, "bstp_transmit_tcn bif NULL"); + ifp = bif->bif_ifp; + if ((ifp->if_flags & IFF_RUNNING) == 0) + 
return; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return; + + m->m_pkthdr.rcvif = ifp; + m->m_pkthdr.len = sizeof(*eh) + sizeof(bpdu); + m->m_len = m->m_pkthdr.len; + + eh = mtod(m, struct ether_header *); + + memcpy(eh->ether_shost, ifnet_lladdr(ifp), ETHER_ADDR_LEN); + memcpy(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN); + eh->ether_type = htons(sizeof(bpdu)); + + bpdu.tbu_ssap = bpdu.tbu_dsap = LLC_8021D_LSAP; + bpdu.tbu_ctl = LLC_UI; + bpdu.tbu_protoid = 0; + bpdu.tbu_protover = 0; + bpdu.tbu_bpdutype = BSTP_MSGTYPE_TCN; + + memcpy(mtod(m, caddr_t) + sizeof(*eh), &bpdu, sizeof(bpdu)); + + bridge_enqueue(sc, ifp, m); // APPLE MODIFICATION - no flags param +} + +void +bstp_configuration_update(struct bridge_softc *sc) +{ + bstp_root_selection(sc); + bstp_designated_port_selection(sc); +} + +void +bstp_root_selection(struct bridge_softc *sc) +{ + struct bridge_iflist *root_port = NULL, *bif; + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if ((bif->bif_flags & IFBIF_STP) == 0) + continue; + if (bstp_designated_port(sc, bif)) + continue; + if (bif->bif_state == BSTP_IFSTATE_DISABLED) + continue; + if (bif->bif_designated_root >= sc->sc_bridge_id) + continue; + if (root_port == NULL) + goto set_port; + + if (bif->bif_designated_root < root_port->bif_designated_root) + goto set_port; + if (bif->bif_designated_root > root_port->bif_designated_root) + continue; + + if ((bif->bif_designated_cost + bif->bif_path_cost) < + (root_port->bif_designated_cost + root_port->bif_path_cost)) + goto set_port; + if ((bif->bif_designated_cost + bif->bif_path_cost) > + (root_port->bif_designated_cost + root_port->bif_path_cost)) + continue; + + if (bif->bif_designated_bridge < + root_port->bif_designated_bridge) + goto set_port; + if (bif->bif_designated_bridge > + root_port->bif_designated_bridge) + continue; + + if (bif->bif_designated_port < root_port->bif_designated_port) + goto set_port; + if (bif->bif_designated_port > root_port->bif_designated_port) + 
continue; + + if (bif->bif_port_id >= root_port->bif_port_id) + continue; +set_port: + root_port = bif; + } + + sc->sc_root_port = root_port; + if (root_port == NULL) { + sc->sc_designated_root = sc->sc_bridge_id; + sc->sc_root_path_cost = 0; + } else { + sc->sc_designated_root = root_port->bif_designated_root; + sc->sc_root_path_cost = root_port->bif_designated_cost + + root_port->bif_path_cost; + } +} + +void +bstp_designated_port_selection(struct bridge_softc *sc) +{ + struct bridge_iflist *bif; + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if ((bif->bif_flags & IFBIF_STP) == 0) + continue; + if (bstp_designated_port(sc, bif)) + goto designated; + if (bif->bif_designated_root != sc->sc_designated_root) + goto designated; + + if (sc->sc_root_path_cost < bif->bif_designated_cost) + goto designated; + if (sc->sc_root_path_cost > bif->bif_designated_cost) + continue; + + if (sc->sc_bridge_id < bif->bif_designated_bridge) + goto designated; + if (sc->sc_bridge_id > bif->bif_designated_bridge) + continue; + + if (bif->bif_port_id > bif->bif_designated_port) + continue; +designated: + bstp_become_designated_port(sc, bif); + } +} + +void +bstp_become_designated_port(struct bridge_softc *sc, struct bridge_iflist *bif) +{ + bif->bif_designated_root = sc->sc_designated_root; + bif->bif_designated_cost = sc->sc_root_path_cost; + bif->bif_designated_bridge = sc->sc_bridge_id; + bif->bif_designated_port = bif->bif_port_id; +} + +void +bstp_port_state_selection(struct bridge_softc *sc) +{ + struct bridge_iflist *bif; + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if ((bif->bif_flags & IFBIF_STP) == 0) + continue; + if (bif == sc->sc_root_port) { + bif->bif_config_pending = 0; + bif->bif_topology_change_acknowledge = 0; + bstp_make_forwarding(sc, bif); + } else if (bstp_designated_port(sc, bif)) { + bstp_timer_stop(&bif->bif_message_age_timer); + bstp_make_forwarding(sc, bif); + } else { + bif->bif_config_pending = 0; + bif->bif_topology_change_acknowledge = 0; + 
bstp_make_blocking(sc, bif); + } + } +} + +void +bstp_make_forwarding(__unused struct bridge_softc *sc, + struct bridge_iflist *bif) +{ + if (bif->bif_state == BSTP_IFSTATE_BLOCKING) { + bstp_set_port_state(bif, BSTP_IFSTATE_LISTENING); + bstp_timer_start(&bif->bif_forward_delay_timer, 0); + } +} + +void +bstp_make_blocking(struct bridge_softc *sc, struct bridge_iflist *bif) +{ + if ((bif->bif_state != BSTP_IFSTATE_DISABLED) && + (bif->bif_state != BSTP_IFSTATE_BLOCKING)) { + if ((bif->bif_state == BSTP_IFSTATE_FORWARDING) || + (bif->bif_state == BSTP_IFSTATE_LEARNING)) { + if (bif->bif_change_detection_enabled) { + bstp_topology_change_detection(sc); + } + } + bstp_set_port_state(bif, BSTP_IFSTATE_BLOCKING); + bstp_timer_stop(&bif->bif_forward_delay_timer); + } +} + +void +bstp_set_port_state(struct bridge_iflist *bif, uint8_t state) +{ + bif->bif_state = state; +} + +void +bstp_topology_change_detection(struct bridge_softc *sc) +{ + if (bstp_root_bridge(sc)) { + sc->sc_topology_change = 1; + bstp_timer_start(&sc->sc_topology_change_timer, 0); + } else if (!sc->sc_topology_change_detected) { + bstp_transmit_tcn(sc); + bstp_timer_start(&sc->sc_tcn_timer, 0); + } + sc->sc_topology_change_detected = 1; +} + +void +bstp_topology_change_acknowledged(struct bridge_softc *sc) +{ + sc->sc_topology_change_detected = 0; + bstp_timer_stop(&sc->sc_tcn_timer); +} + +void +bstp_acknowledge_topology_change(struct bridge_softc *sc, + struct bridge_iflist *bif) +{ + bif->bif_topology_change_acknowledge = 1; + bstp_transmit_config(sc, bif); +} + +__private_extern__ struct mbuf * +bstp_input(struct bridge_softc *sc, struct ifnet *ifp, struct mbuf *m) +{ + struct bridge_iflist *bif = NULL; + struct ether_header *eh; + struct bstp_tbpdu tpdu; + struct bstp_cbpdu cpdu; + struct bstp_config_unit cu; + struct bstp_tcn_unit tu; + uint16_t len; + + eh = mtod(m, struct ether_header *); + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if ((bif->bif_flags & IFBIF_STP) == 0) + continue; + 
if (bif->bif_ifp == ifp) + break; + } + if (bif == NULL) + goto out; + + len = ntohs(eh->ether_type); + if (len < sizeof(tpdu)) + goto out; + + m_adj(m, ETHER_HDR_LEN); + + if (m->m_pkthdr.len > len) + m_adj(m, len - m->m_pkthdr.len); + if ((size_t)m->m_len < sizeof(tpdu) && + (m = m_pullup(m, sizeof(tpdu))) == NULL) + goto out; + + memcpy(&tpdu, mtod(m, caddr_t), sizeof(tpdu)); + + if (tpdu.tbu_dsap != LLC_8021D_LSAP || + tpdu.tbu_ssap != LLC_8021D_LSAP || + tpdu.tbu_ctl != LLC_UI) + goto out; + if (tpdu.tbu_protoid != 0 || tpdu.tbu_protover != 0) + goto out; + + switch (tpdu.tbu_bpdutype) { + case BSTP_MSGTYPE_TCN: + tu.tu_message_type = tpdu.tbu_bpdutype; + bstp_received_tcn_bpdu(sc, bif, &tu); + break; + case BSTP_MSGTYPE_CFG: + if ((size_t)m->m_len < sizeof(cpdu) && + (m = m_pullup(m, sizeof(cpdu))) == NULL) + goto out; + memcpy(&cpdu, mtod(m, caddr_t), sizeof(cpdu)); + + cu.cu_rootid = + (((uint64_t)ntohs(cpdu.cbu_rootpri)) << 48) | + (((uint64_t)cpdu.cbu_rootaddr[0]) << 40) | + (((uint64_t)cpdu.cbu_rootaddr[1]) << 32) | + (((uint64_t)cpdu.cbu_rootaddr[2]) << 24) | + (((uint64_t)cpdu.cbu_rootaddr[3]) << 16) | + (((uint64_t)cpdu.cbu_rootaddr[4]) << 8) | + (((uint64_t)cpdu.cbu_rootaddr[5]) << 0); + + cu.cu_bridge_id = + (((uint64_t)ntohs(cpdu.cbu_bridgepri)) << 48) | + (((uint64_t)cpdu.cbu_bridgeaddr[0]) << 40) | + (((uint64_t)cpdu.cbu_bridgeaddr[1]) << 32) | + (((uint64_t)cpdu.cbu_bridgeaddr[2]) << 24) | + (((uint64_t)cpdu.cbu_bridgeaddr[3]) << 16) | + (((uint64_t)cpdu.cbu_bridgeaddr[4]) << 8) | + (((uint64_t)cpdu.cbu_bridgeaddr[5]) << 0); + + cu.cu_root_path_cost = ntohl(cpdu.cbu_rootpathcost); + cu.cu_message_age = ntohs(cpdu.cbu_messageage); + cu.cu_max_age = ntohs(cpdu.cbu_maxage); + cu.cu_hello_time = ntohs(cpdu.cbu_hellotime); + cu.cu_forward_delay = ntohs(cpdu.cbu_forwarddelay); + cu.cu_port_id = ntohs(cpdu.cbu_portid); + cu.cu_message_type = cpdu.cbu_bpdutype; + cu.cu_topology_change_acknowledgment = + (cpdu.cbu_flags & BSTP_FLAG_TCA) ? 
1 : 0; + cu.cu_topology_change = + (cpdu.cbu_flags & BSTP_FLAG_TC) ? 1 : 0; + bstp_received_config_bpdu(sc, bif, &cu); + break; + default: + goto out; + } + + out: + if (m) + m_freem(m); + return (NULL); +} + +void +bstp_received_config_bpdu(struct bridge_softc *sc, struct bridge_iflist *bif, + struct bstp_config_unit *cu) +{ + int root; + + root = bstp_root_bridge(sc); + + if (bif->bif_state != BSTP_IFSTATE_DISABLED) { + if (bstp_supersedes_port_info(sc, bif, cu)) { + bstp_record_config_information(sc, bif, cu); + bstp_configuration_update(sc); + bstp_port_state_selection(sc); + + if ((bstp_root_bridge(sc) == 0) && root) { + bstp_timer_stop(&sc->sc_hello_timer); + + if (sc->sc_topology_change_detected) { + bstp_timer_stop( + &sc->sc_topology_change_timer); + bstp_transmit_tcn(sc); + bstp_timer_start(&sc->sc_tcn_timer, 0); + } + } + + if (bif == sc->sc_root_port) { + bstp_record_config_timeout_values(sc, cu); + bstp_config_bpdu_generation(sc); + + if (cu->cu_topology_change_acknowledgment) + bstp_topology_change_acknowledged(sc); + } + } else if (bstp_designated_port(sc, bif)) + bstp_transmit_config(sc, bif); + } +} + +void +bstp_received_tcn_bpdu(struct bridge_softc *sc, struct bridge_iflist *bif, + __unused struct bstp_tcn_unit *tcn) +{ + if (bif->bif_state != BSTP_IFSTATE_DISABLED && + bstp_designated_port(sc, bif)) { + bstp_topology_change_detection(sc); + bstp_acknowledge_topology_change(sc, bif); + } +} + +void +bstp_hello_timer_expiry(struct bridge_softc *sc) +{ + bstp_config_bpdu_generation(sc); + bstp_timer_start(&sc->sc_hello_timer, 0); +} + +void +bstp_message_age_timer_expiry(struct bridge_softc *sc, + struct bridge_iflist *bif) +{ + int root; + + root = bstp_root_bridge(sc); + bstp_become_designated_port(sc, bif); + bstp_configuration_update(sc); + bstp_port_state_selection(sc); + + if ((bstp_root_bridge(sc)) && (root == 0)) { + sc->sc_max_age = sc->sc_bridge_max_age; + sc->sc_hello_time = sc->sc_bridge_hello_time; + sc->sc_forward_delay = 
sc->sc_bridge_forward_delay; + + bstp_topology_change_detection(sc); + bstp_timer_stop(&sc->sc_tcn_timer); + bstp_config_bpdu_generation(sc); + bstp_timer_start(&sc->sc_hello_timer, 0); + } +} + +void +bstp_forward_delay_timer_expiry(struct bridge_softc *sc, + struct bridge_iflist *bif) +{ + if (bif->bif_state == BSTP_IFSTATE_LISTENING) { + bstp_set_port_state(bif, BSTP_IFSTATE_LEARNING); + bstp_timer_start(&bif->bif_forward_delay_timer, 0); + } else if (bif->bif_state == BSTP_IFSTATE_LEARNING) { + bstp_set_port_state(bif, BSTP_IFSTATE_FORWARDING); + if (bstp_designated_for_some_port(sc) && + bif->bif_change_detection_enabled) + bstp_topology_change_detection(sc); + } +} + +int +bstp_designated_for_some_port(struct bridge_softc *sc) +{ + + struct bridge_iflist *bif; + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if ((bif->bif_flags & IFBIF_STP) == 0) + continue; + if (bif->bif_designated_bridge == sc->sc_bridge_id) + return (1); + } + return (0); +} + +void +bstp_tcn_timer_expiry(struct bridge_softc *sc) +{ + bstp_transmit_tcn(sc); + bstp_timer_start(&sc->sc_tcn_timer, 0); +} + +void +bstp_topology_change_timer_expiry(struct bridge_softc *sc) +{ + sc->sc_topology_change_detected = 0; + sc->sc_topology_change = 0; +} + +void +bstp_hold_timer_expiry(struct bridge_softc *sc, struct bridge_iflist *bif) +{ + if (bif->bif_config_pending) + bstp_transmit_config(sc, bif); +} + +__private_extern__ void +bstp_initialization(struct bridge_softc *sc) +{ + struct bridge_iflist *bif, *mif; + struct timespec ts; + unsigned char *lladdr; + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); + + mif = NULL; + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if ((bif->bif_flags & IFBIF_STP) == 0) + continue; + if (bif->bif_ifp->if_type != IFT_ETHER) + continue; + bif->bif_port_id = (bif->bif_priority << 8) | + (bif->bif_ifp->if_index & 0xff); + + if (mif == NULL) { + mif = bif; + continue; + } + if (memcmp(ifnet_lladdr(bif->bif_ifp), + ifnet_lladdr(mif->bif_ifp), ETHER_ADDR_LEN) 
< 0) { + mif = bif; + continue; + } + } + if (mif == NULL) { + bstp_stop(sc); + return; + } + + lladdr = ifnet_lladdr(mif->bif_ifp); + sc->sc_bridge_id = /* 64-bit id: bridge priority in bits 63..48, address of the chosen member (mif) in bits 47..0 */ + (((uint64_t)sc->sc_bridge_priority) << 48) | + (((uint64_t)lladdr[0]) << 40) | + (((uint64_t)lladdr[1]) << 32) | + (((uint64_t)lladdr[2]) << 24) | /* widen before shifting: as an int, a byte >= 0x80 would sign-extend into bits 63..32 */ + (((uint64_t)lladdr[3]) << 16) | + (((uint64_t)lladdr[4]) << 8) | + (((uint64_t)lladdr[5]) << 0); + + sc->sc_designated_root = sc->sc_bridge_id; + sc->sc_root_path_cost = 0; + sc->sc_root_port = NULL; + + sc->sc_max_age = sc->sc_bridge_max_age; + sc->sc_hello_time = sc->sc_bridge_hello_time; + sc->sc_forward_delay = sc->sc_bridge_forward_delay; + sc->sc_topology_change_detected = 0; + sc->sc_topology_change = 0; + bstp_timer_stop(&sc->sc_tcn_timer); + bstp_timer_stop(&sc->sc_topology_change_timer); + + bsd_untimeout(bstp_tick, sc); + ts.tv_sec = 1; + ts.tv_nsec = 0; + bsd_timeout(bstp_tick, sc, &ts); + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if (bif->bif_flags & IFBIF_STP) + bstp_enable_port(sc, bif); + else + bstp_disable_port(sc, bif); + } + + bstp_port_state_selection(sc); + bstp_config_bpdu_generation(sc); + bstp_timer_start(&sc->sc_hello_timer, 0); +} + +__private_extern__ void +bstp_stop(struct bridge_softc *sc) +{ + struct bridge_iflist *bif; + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + bstp_set_port_state(bif, BSTP_IFSTATE_DISABLED); + bstp_timer_stop(&bif->bif_hold_timer); + bstp_timer_stop(&bif->bif_message_age_timer); + bstp_timer_stop(&bif->bif_forward_delay_timer); + } + + bsd_untimeout(bstp_tick, sc); + + bstp_timer_stop(&sc->sc_topology_change_timer); + bstp_timer_stop(&sc->sc_tcn_timer); + bstp_timer_stop(&sc->sc_hello_timer); + +} + +void +bstp_initialize_port(struct bridge_softc *sc, struct bridge_iflist *bif) +{ + bstp_become_designated_port(sc, bif); + bstp_set_port_state(bif, BSTP_IFSTATE_BLOCKING); + bif->bif_topology_change_acknowledge = 0; + bif->bif_config_pending = 0; + bif->bif_change_detection_enabled = 1; + bstp_timer_stop(&bif->bif_message_age_timer); + 
bstp_timer_stop(&bif->bif_forward_delay_timer); + bstp_timer_stop(&bif->bif_hold_timer); +} + +void +bstp_enable_port(struct bridge_softc *sc, struct bridge_iflist *bif) +{ + bstp_initialize_port(sc, bif); + bstp_port_state_selection(sc); +} + +void +bstp_disable_port(struct bridge_softc *sc, struct bridge_iflist *bif) +{ + int root; + + root = bstp_root_bridge(sc); + bstp_become_designated_port(sc, bif); + bstp_set_port_state(bif, BSTP_IFSTATE_DISABLED); + bif->bif_topology_change_acknowledge = 0; + bif->bif_config_pending = 0; + bstp_timer_stop(&bif->bif_message_age_timer); + bstp_timer_stop(&bif->bif_forward_delay_timer); + bstp_configuration_update(sc); + bstp_port_state_selection(sc); + + if (bstp_root_bridge(sc) && (root == 0)) { + sc->sc_max_age = sc->sc_bridge_max_age; + sc->sc_hello_time = sc->sc_bridge_hello_time; + sc->sc_forward_delay = sc->sc_bridge_forward_delay; + + bstp_topology_change_detection(sc); + bstp_timer_stop(&sc->sc_tcn_timer); + bstp_config_bpdu_generation(sc); + bstp_timer_start(&sc->sc_hello_timer, 0); + } +} + +void +bstp_set_bridge_priority(struct bridge_softc *sc, uint64_t new_bridge_id) +{ + struct bridge_iflist *bif; + int root; + + root = bstp_root_bridge(sc); + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if ((bif->bif_flags & IFBIF_STP) == 0) + continue; + if (bstp_designated_port(sc, bif)) + bif->bif_designated_bridge = new_bridge_id; + } + + sc->sc_bridge_id = new_bridge_id; + + bstp_configuration_update(sc); + bstp_port_state_selection(sc); + + if (bstp_root_bridge(sc) && (root == 0)) { + sc->sc_max_age = sc->sc_bridge_max_age; + sc->sc_hello_time = sc->sc_bridge_hello_time; + sc->sc_forward_delay = sc->sc_bridge_forward_delay; + + bstp_topology_change_detection(sc); + bstp_timer_stop(&sc->sc_tcn_timer); + bstp_config_bpdu_generation(sc); + bstp_timer_start(&sc->sc_hello_timer, 0); + } +} + +void +bstp_set_port_priority(struct bridge_softc *sc, struct bridge_iflist *bif, + uint16_t new_port_id) +{ + if 
(bstp_designated_port(sc, bif)) + bif->bif_designated_port = new_port_id; + + bif->bif_port_id = new_port_id; + + if ((sc->sc_bridge_id == bif->bif_designated_bridge) && + (bif->bif_port_id < bif->bif_designated_port)) { + bstp_become_designated_port(sc, bif); + bstp_port_state_selection(sc); + } +} + +void +bstp_set_path_cost(struct bridge_softc *sc, struct bridge_iflist *bif, + uint32_t path_cost) +{ + bif->bif_path_cost = path_cost; + bstp_configuration_update(sc); /* re-runs root selection and designated port selection */ + bstp_port_state_selection(sc); +} + +void +bstp_enable_change_detection(struct bridge_iflist *bif) +{ + bif->bif_change_detection_enabled = 1; +} + +void +bstp_disable_change_detection(struct bridge_iflist *bif) +{ + bif->bif_change_detection_enabled = 0; +} + +void +bstp_ifupdstatus(struct bridge_softc *sc, struct bridge_iflist *bif) +{ + struct ifnet *ifp = bif->bif_ifp; + struct ifmediareq ifmr; + + if ((ifnet_flags(ifp) & IFF_UP)) { + bzero(&ifmr, sizeof(ifmr)); + if (ifnet_ioctl(ifp, 0, SIOCGIFMEDIA, &ifmr) == 0) { + // enable the port when the link is up, or its state is unknown + if ((ifmr.ifm_status & IFM_ACTIVE) || !(ifmr.ifm_status & IFM_AVALID)) { + if (bif->bif_state == BSTP_IFSTATE_DISABLED) + bstp_enable_port(sc, bif); + } else { + if (bif->bif_state != BSTP_IFSTATE_DISABLED) + bstp_disable_port(sc, bif); + } + } + return; + } + + if (bif->bif_state != BSTP_IFSTATE_DISABLED) + bstp_disable_port(sc, bif); +} + +void +bstp_tick(void *arg) +{ + struct bridge_softc *sc = arg; + struct bridge_iflist *bif; + struct timespec ts; + + lck_mtx_lock(sc->sc_mtx); + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if ((bif->bif_flags & IFBIF_STP) == 0) + continue; + /* + * XXX This can cause a lag between "link goes away" + * XXX and "spanning tree gets updated". We need + * XXX some sort of callback from the link state + * XXX update code to kick spanning tree. 
+ * XXX --thorpej@NetBSD.org + */ + bstp_ifupdstatus(sc, bif); + } + + if (bstp_timer_expired(&sc->sc_hello_timer, sc->sc_hello_time)) + bstp_hello_timer_expiry(sc); + + if (bstp_timer_expired(&sc->sc_tcn_timer, sc->sc_bridge_hello_time)) + bstp_tcn_timer_expiry(sc); + + if (bstp_timer_expired(&sc->sc_topology_change_timer, + sc->sc_topology_change_time)) + bstp_topology_change_timer_expiry(sc); + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if ((bif->bif_flags & IFBIF_STP) == 0) + continue; + if (bstp_timer_expired(&bif->bif_message_age_timer, + sc->sc_max_age)) + bstp_message_age_timer_expiry(sc, bif); + } + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if ((bif->bif_flags & IFBIF_STP) == 0) + continue; + if (bstp_timer_expired(&bif->bif_forward_delay_timer, + sc->sc_forward_delay)) + bstp_forward_delay_timer_expiry(sc, bif); + + if (bstp_timer_expired(&bif->bif_hold_timer, + sc->sc_hold_time)) + bstp_hold_timer_expiry(sc, bif); + } + + lck_mtx_unlock(sc->sc_mtx); + + /* APPLE MODIFICATION - bridge changes */ + if (ifnet_flags(sc->sc_if) & IFF_RUNNING) { + ts.tv_sec = 1; + ts.tv_nsec = 0; + bsd_timeout(bstp_tick, sc, &ts); + } +} + +void +bstp_timer_start(struct bridge_timer *t, uint16_t v) +{ + t->value = v; + t->active = 1; +} + +void +bstp_timer_stop(struct bridge_timer *t) +{ + t->value = 0; + t->active = 0; +} + +int +bstp_timer_expired(struct bridge_timer *t, uint16_t v) +{ + if (t->active == 0) + return (0); + t->value += BSTP_TICK_VAL; + if (t->value >= v) { + bstp_timer_stop(t); + return (1); + } + return (0); + +} diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c index e82208b12..254d94b77 100644 --- a/bsd/net/dlil.c +++ b/bsd/net/dlil.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2008 Apple Inc. All rights reserved. + * Copyright (c) 1999-2009 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1003,6 +1003,14 @@ dlil_interface_filters_input(struct ifnet * ifp, struct mbuf * * m_p, } } } + + /* + * Strip away M_PROTO1 bit prior to sending packet up the stack as + * it is meant to be local to a subsystem -- if_bridge for M_PROTO1 + */ + if (*m_p != NULL) + (*m_p)->m_flags &= ~M_PROTO1; + return (0); } @@ -1350,28 +1358,6 @@ dlil_output_list( } } -#if BRIDGE - /* !!!LOCKING!!! - * - * Need to consider how to handle this. - * Also note that return should be a goto cleanup - */ - broken-locking - if (do_bridge) { - struct mbuf *m0 = m; - struct ether_header *eh = mtod(m, struct ether_header *); - - if (m->m_pkthdr.rcvif) - m->m_pkthdr.rcvif = NULL; - ifp = bridge_dst_lookup(eh); - bdg_forward(&m0, ifp); - if (m0) - m_freem(m0); - - return 0 - should be goto cleanup? - } -#endif - /* * Let interface filters (if any) do their thing ... */ @@ -1389,6 +1375,11 @@ dlil_output_list( } } } + /* + * Strip away M_PROTO1 bit prior to sending packet to the driver + * as this field may be used by the driver + */ + m->m_flags &= ~M_PROTO1; /* * Finally, call the driver. @@ -1559,28 +1550,6 @@ dlil_output( m->m_pkthdr.rcvif = NULL; } -#if BRIDGE - /* !!!LOCKING!!! - * - * Need to consider how to handle this. - * Also note that return should be a goto cleanup - */ - broken-locking - if (do_bridge) { - struct mbuf *m0 = m; - struct ether_header *eh = mtod(m, struct ether_header *); - - if (m->m_pkthdr.rcvif) - m->m_pkthdr.rcvif = NULL; - ifp = bridge_dst_lookup(eh); - bdg_forward(&m0, ifp); - if (m0) - m_freem(m0); - - return 0 - should be goto cleanup? - } -#endif - /* * Let interface filters (if any) do their thing ... 
*/ @@ -1599,6 +1568,12 @@ dlil_output( } } + /* + * Strip away M_PROTO1 bit prior to sending packet to the driver + * as this field may be used by the driver + */ + m->m_flags &= ~M_PROTO1; + /* * If the underlying interface is not capable of handling a * packet whose data portion spans across physically disjoint diff --git a/bsd/net/ether_at_pr_module.c b/bsd/net/ether_at_pr_module.c index 9ae109b85..1adcbe27e 100644 --- a/bsd/net/ether_at_pr_module.c +++ b/bsd/net/ether_at_pr_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000,2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -88,10 +88,6 @@ extern struct ifqueue atalkintrq; #endif -#if BRIDGE -#include -#endif - /* #include "vlan.h" */ #if NVLAN > 0 #include diff --git a/bsd/net/ether_if_module.c b/bsd/net/ether_if_module.c index 42e0a67a7..a6ec5b2c5 100644 --- a/bsd/net/ether_if_module.c +++ b/bsd/net/ether_if_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -98,6 +98,9 @@ #include #include #include +#if IF_BRIDGE +#include +#endif #include @@ -113,10 +116,6 @@ extern struct ifqueue atalkintrq; #endif -#if BRIDGE -#include -#endif - #define memcpy(x,y,z) bcopy(y, x, z) @@ -636,6 +635,9 @@ __private_extern__ int ether_family_init(void) #if BOND bond_family_init(); #endif /* BOND */ +#if IF_BRIDGE + bridgeattach(0); +#endif done: diff --git a/bsd/net/ether_inet6_pr_module.c b/bsd/net/ether_inet6_pr_module.c index 52fd39229..371cccfd6 100644 --- a/bsd/net/ether_inet6_pr_module.c +++ b/bsd/net/ether_inet6_pr_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -101,11 +101,6 @@ extern struct ifqueue pkintrq; #endif - -#if BRIDGE -#include -#endif - /* #include "vlan.h" */ #if NVLAN > 0 #include diff --git a/bsd/net/ether_inet_pr_module.c b/bsd/net/ether_inet_pr_module.c index 177631c4b..422866e73 100644 --- a/bsd/net/ether_inet_pr_module.c +++ b/bsd/net/ether_inet_pr_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -94,10 +94,6 @@ #include -#if BRIDGE -#include -#endif - /* #include "vlan.h" */ #if NVLAN > 0 #include diff --git a/bsd/net/ethernet.h b/bsd/net/ethernet.h index efbf23c0f..00b7fa5fb 100644 --- a/bsd/net/ethernet.h +++ b/bsd/net/ethernet.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000,2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,6 +95,8 @@ struct ether_addr { #define ETHERTYPE_REVARP 0x8035 /* reverse Addr. resolution protocol */ #define ETHERTYPE_VLAN 0x8100 /* IEEE 802.1Q VLAN tagging */ #define ETHERTYPE_IPV6 0x86dd /* IPv6 */ +#define ETHERTYPE_PAE 0x888e /* EAPOL PAE/802.1x */ +#define ETHERTYPE_RSN_PREAUTH 0x88c7 /* 802.11i / RSN Pre-Authentication */ #define ETHERTYPE_LOOPBACK 0x9000 /* used to test interfaces */ /* XXX - add more useful types here */ @@ -119,6 +121,9 @@ struct ether_addr *ether_aton(const char *); #ifdef BSD_KERNEL_PRIVATE extern u_char etherbroadcastaddr[ETHER_ADDR_LEN]; #endif + +#define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? */ + #endif /* KERNEL_PRIVATE */ #ifndef KERNEL diff --git a/bsd/net/if.h b/bsd/net/if.h index 20f360037..229eb134f 100644 --- a/bsd/net/if.h +++ b/bsd/net/if.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -101,6 +101,8 @@ #endif #ifdef KERNEL_PRIVATE +#define IF_MAXUNIT 0x7fff /* historical value */ + struct if_clonereq { int ifcr_total; /* total cloners (out) */ int ifcr_count; /* room for this many in user buffer */ @@ -406,6 +408,34 @@ struct ifmediareq32 { #pragma pack() #endif /* KERNEL_PRIVATE */ + +#pragma pack(4) +struct ifdrv { + char ifd_name[IFNAMSIZ]; /* if name, e.g. "en0" */ + unsigned long ifd_cmd; + size_t ifd_len; + void *ifd_data; +}; +#pragma pack() + +#ifdef KERNEL_PRIVATE +#pragma pack(4) +struct ifdrv32 { + char ifd_name[IFNAMSIZ]; /* if name, e.g. "en0" */ + u_int32_t ifd_cmd; + u_int32_t ifd_len; + user32_addr_t ifd_data; +}; + +struct ifdrv64 { + char ifd_name[IFNAMSIZ]; /* if name, e.g. "en0" */ + u_int64_t ifd_cmd; + u_int64_t ifd_len; + user64_addr_t ifd_data; +}; +#pragma pack() +#endif /* KERNEL_PRIVATE */ + /* * Structure used to retrieve aux status data from interfaces. * Kernel suppliers to this interface should respect the formatting diff --git a/bsd/net/if_bridge.c b/bsd/net/if_bridge.c new file mode 100644 index 000000000..acce8faa5 --- /dev/null +++ b/bsd/net/if_bridge.c @@ -0,0 +1,3847 @@ +/* + * Copyright (c) 2004-2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $apfw: Revision 1.19 2008/10/24 02:34:06 cbzimmer Exp $ */ +/* $NetBSD: if_bridge.c,v 1.46 2006/11/23 04:07:07 rpaulo Exp $ */ + +/* + * Copyright 2001 Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Jason R. Thorpe for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. 
``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Jason L. Wright + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * OpenBSD: if_bridge.c,v 1.60 2001/06/15 03:38:33 itojun Exp + */ + +/* + * Network interface bridge support. + * + * TODO: + * + * - Currently only supports Ethernet-like interfaces (Ethernet, + * 802.11, VLANs on Ethernet, etc.) Figure out a nice way + * to bridge other types of interfaces (FDDI-FDDI, and maybe + * consider heterogenous bridges). + */ + +#include +//_KERNEL_RCSID(0, "$NetBSD: if_bridge.c,v 1.46 2006/11/23 04:07:07 rpaulo Exp $"); + +//#include "opt_bridge_ipf.h" +//#include "opt_inet.h" +//#include "opt_pfil_hooks.h" +//#include "opt_wlan.h" /* APPLE MODIFICATION - Proxy STA support */ +//#include "bpfilter.h" +//#include "gif.h" // APPLE MODIFICATION - add gif support + +#define BRIDGE_DEBUG 0 + +#include +#include +#include +#include +#include +#include +#include +#include +//#include +#include +#include +#include +#include +#include + +#include + +#include + +#if NBPFILTER > 0 +#include +#endif +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#ifdef INET6 +#include +#include +#include +#endif + +#if BRIDGE_DEBUG +#define static __private_extern__ +#endif + +extern void dlil_input_packet_list(struct ifnet *, struct mbuf *); + +/* + * Size of the route hash table. Must be a power of two. 
 */
/* APPLE MODIFICATION - per Wasabi performance improvement, change the hash table size */
/*
 * The 1024-bucket default is compiled out by the "#if 0" below; unless the
 * build pre-defines BRIDGE_RTHASH_SIZE, the table gets 256 buckets.
 * BRIDGE_RTHASH_MASK is derived as (size - 1), which is why the size must
 * stay a power of two.
 */
#if 0
#ifndef BRIDGE_RTHASH_SIZE
#define BRIDGE_RTHASH_SIZE 1024
#endif
#else
#ifndef BRIDGE_RTHASH_SIZE
#define BRIDGE_RTHASH_SIZE 256
#endif
#endif

/* APPLE MODIFICATION - support for HW checksums */
#if APPLE_BRIDGE_HWCKSUM_SUPPORT
#include
#include
#endif

#define BRIDGE_RTHASH_MASK (BRIDGE_RTHASH_SIZE - 1)

//#include "carp.h"
#if NCARP > 0
#include
#include
#include
#endif

/*
 * Maximum number of addresses to cache.
 */
#ifndef BRIDGE_RTABLE_MAX
#define BRIDGE_RTABLE_MAX 100
#endif

/* APPLE MODIFICATION - add support for Proxy STA */
#if IEEE80211_PROXYSTA
/*
 * Maximum (additional to maxcache) number of proxysta addresses to cache.
 */
#ifndef BRIDGE_RTABLE_MAX_PROXYSTA
#define BRIDGE_RTABLE_MAX_PROXYSTA 16
#endif
#endif

/*
 * Spanning tree defaults.
 *
 * NOTE(review): the "* 256" scaling on the timer values presumably expresses
 * them in 1/256-second units -- confirm against the spanning tree code.
 */
#define BSTP_DEFAULT_MAX_AGE (20 * 256)
#define BSTP_DEFAULT_HELLO_TIME (2 * 256)
#define BSTP_DEFAULT_FORWARD_DELAY (15 * 256)
#define BSTP_DEFAULT_HOLD_TIME (1 * 256)
#define BSTP_DEFAULT_BRIDGE_PRIORITY 0x8000
#define BSTP_DEFAULT_PORT_PRIORITY 0x80
#define BSTP_DEFAULT_PATH_COST 55

/*
 * Timeout (in seconds) for entries learned dynamically.
 */
#ifndef BRIDGE_RTABLE_TIMEOUT
#define BRIDGE_RTABLE_TIMEOUT (20 * 60) /* same as ARP */
#endif

/*
 * Number of seconds between walks of the route list.
 */
#ifndef BRIDGE_RTABLE_PRUNE_PERIOD
#define BRIDGE_RTABLE_PRUNE_PERIOD (5 * 60)
#endif

/*
 * List of capabilities to mask on the member interface.
 */
#define BRIDGE_IFCAPS_MASK \
	(IFCAP_CSUM_IPv4_Tx | \
	IFCAP_CSUM_TCPv4_Tx | \
	IFCAP_CSUM_UDPv4_Tx | \
	IFCAP_CSUM_TCPv6_Tx | \
	IFCAP_CSUM_UDPv6_Tx)


/* Not static: visible outside this file; initialised from the compile-time default. */
int bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD;

/* Zone for struct bridge_rtnode allocations; created in bridgeattach(). */
static zone_t bridge_rtnode_pool = NULL;

/*
 * Forward declarations.
 *
 * Interface filter callbacks.
 */
static errno_t
bridge_iff_input(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol,
	mbuf_t *data, char **frame_ptr);
static void
bridge_iff_event(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol,
	const struct kev_msg *event_msg);
static void
bridge_iff_detached(void* cookie, __unused ifnet_t interface);

static uint32_t
bridge_rthash(__unused struct bridge_softc *sc, const uint8_t *addr);

/* Cloner entry points (referenced by bridge_cloner). */
static int bridge_clone_create(struct if_clone *, int);
static void bridge_clone_destroy(struct ifnet *);

static errno_t bridge_ioctl(ifnet_t ifp, unsigned long cmd, void *data);
#if HAS_IF_CAP
static void bridge_mutecaps(struct bridge_iflist *, int);
#endif
static int bridge_init(struct ifnet *);
static void bridge_stop(struct ifnet *, int);

#if BRIDGE_MEMBER_OUT_FILTER
static errno_t
bridge_iff_output(void *cookie, ifnet_t ifp, protocol_family_t protocol, mbuf_t *data);
static int bridge_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t m);
#endif /* BRIDGE_MEMBER_OUT_FILTER */

static errno_t bridge_start(struct ifnet *, mbuf_t);
static errno_t bridge_set_bpf_tap(ifnet_t ifn, bpf_tap_mode mode, bpf_packet_func bpf_callback);
__private_extern__ errno_t bridge_bpf_input(ifnet_t ifp, struct mbuf *m);
__private_extern__ errno_t bridge_bpf_output(ifnet_t ifp, struct mbuf *m);

static void bridge_detach(ifnet_t ifp);

static errno_t bridge_input(struct bridge_iflist *, struct ifnet *, struct mbuf *, void *frame_header);

static void bridge_forward(struct bridge_softc *, struct mbuf *m);

static void bridge_timer(void *);

static void bridge_broadcast(struct bridge_softc *, struct ifnet *,
	struct mbuf *, int);

/* Address (route) table maintenance. */
static int bridge_rtupdate(struct bridge_softc *, const uint8_t *,
	struct ifnet *, int, uint8_t);
static struct ifnet *bridge_rtlookup(struct bridge_softc *, const uint8_t *);
static void bridge_rttrim(struct bridge_softc *);
static void bridge_rtage(struct bridge_softc *);
static void bridge_rtflush(struct bridge_softc *, int);
/* APPLE MODIFICATION - add support for Proxy STA */
#if IEEE80211_PROXYSTA
static void bridge_rtdiscovery(struct bridge_softc *);
static void bridge_rtpurge(struct bridge_softc *, struct ifnet *);
#endif
static int bridge_rtdaddr(struct bridge_softc *, const uint8_t *);

static int bridge_rtable_init(struct bridge_softc *);
static void bridge_rtable_fini(struct bridge_softc *);

static struct bridge_rtnode *bridge_rtnode_lookup(struct bridge_softc *,
	const uint8_t *);
static int bridge_rtnode_insert(struct bridge_softc *,
	struct bridge_rtnode *);
static void bridge_rtnode_destroy(struct bridge_softc *,
	struct bridge_rtnode *);

/* Member interface bookkeeping. */
static struct bridge_iflist *bridge_lookup_member(struct bridge_softc *,
	const char *name);
static struct bridge_iflist *bridge_lookup_member_if(struct bridge_softc *,
	struct ifnet *ifp);
static void bridge_delete_member(struct bridge_softc *,
	struct bridge_iflist *);

static void bridge_ifdetach(struct bridge_iflist *bif, struct ifnet *ifp);


/*
 * Driver-specific ioctl handlers.  Each one is reached through a
 * bridge_control table entry and receives the softc plus a pointer to the
 * kernel copy of the argument structure.  The 32/64 suffixed variants exist
 * for the argument structures that have separate 32-bit and 64-bit layouts.
 */
static int bridge_ioctl_add(struct bridge_softc *, void *);
static int bridge_ioctl_del(struct bridge_softc *, void *);
/* APPLE MODIFICATION - add support for Proxy STA */
#if IEEE80211_PROXYSTA
static int bridge_ioctl_purge(struct bridge_softc *sc, void *arg);
#endif
static int bridge_ioctl_gifflags(struct bridge_softc *, void *);
static int bridge_ioctl_sifflags(struct bridge_softc *, void *);
static int bridge_ioctl_scache(struct bridge_softc *, void *);
static int bridge_ioctl_gcache(struct bridge_softc *, void *);
static int bridge_ioctl_gifs32(struct bridge_softc *, void *);
static int bridge_ioctl_gifs64(struct bridge_softc *, void *);
static int bridge_ioctl_rts32(struct bridge_softc *, void *);
static int bridge_ioctl_rts64(struct bridge_softc *, void *);
static int bridge_ioctl_saddr32(struct bridge_softc *, void *);
static int bridge_ioctl_saddr64(struct bridge_softc *, void *);
static int bridge_ioctl_sto(struct bridge_softc *, void *);
static int bridge_ioctl_gto(struct bridge_softc *, void *);
static int bridge_ioctl_daddr32(struct bridge_softc *, void *);
static int bridge_ioctl_daddr64(struct bridge_softc *, void *);
static int bridge_ioctl_flush(struct bridge_softc *, void *);
static int bridge_ioctl_gpri(struct bridge_softc *, void *);
static int bridge_ioctl_spri(struct bridge_softc *, void *);
static int bridge_ioctl_ght(struct bridge_softc *, void *);
static int bridge_ioctl_sht(struct bridge_softc *, void *);
static int bridge_ioctl_gfd(struct bridge_softc *, void *);
static int bridge_ioctl_sfd(struct bridge_softc *, void *);
static int bridge_ioctl_gma(struct bridge_softc *, void *);
static int bridge_ioctl_sma(struct bridge_softc *, void *);
static int bridge_ioctl_sifprio(struct bridge_softc *, void *);
static int bridge_ioctl_sifcost(struct bridge_softc *, void *);

/*
 * One entry of the driver-specific ioctl dispatch tables.
 *
 *   bc_func     handler, called as (*bc_func)(sc, &args)
 *   bc_argsize  exact size the caller-supplied ifd_len must have
 *   bc_flags    BC_F_* bits selecting copyin/copyout and the privilege check
 */
struct bridge_control {
	int (*bc_func)(struct bridge_softc *, void *);
	unsigned int bc_argsize;
	unsigned int bc_flags;
};

#define BC_F_COPYIN 0x01 /* copy arguments in */
#define BC_F_COPYOUT 0x02 /* copy arguments out */
#define BC_F_SUSER 0x04 /* do super-user check */

/*
 * Dispatch table used for the SIOC[SG]DRVSPEC32 requests.  The position of
 * an entry is its command number (the table is indexed with ifd_cmd), so
 * entries must never be reordered, and this table must stay entry-for-entry
 * parallel to bridge_control_table64.
 */
static const struct bridge_control bridge_control_table32[] = {
	{ bridge_ioctl_add, sizeof(struct ifbreq),
	  BC_F_COPYIN|BC_F_SUSER },
	{ bridge_ioctl_del, sizeof(struct ifbreq),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_gifflags, sizeof(struct ifbreq),
	  BC_F_COPYIN|BC_F_COPYOUT },
	{ bridge_ioctl_sifflags, sizeof(struct ifbreq),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_scache, sizeof(struct ifbrparam),
	  BC_F_COPYIN|BC_F_SUSER },
	{ bridge_ioctl_gcache, sizeof(struct ifbrparam),
	  BC_F_COPYOUT },

	{ bridge_ioctl_gifs32, sizeof(struct ifbifconf32),
	  BC_F_COPYIN|BC_F_COPYOUT },
	{ bridge_ioctl_rts32, sizeof(struct ifbaconf32),
	  BC_F_COPYIN|BC_F_COPYOUT },

	{ bridge_ioctl_saddr32, sizeof(struct ifbareq32),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_sto, sizeof(struct ifbrparam),
	  BC_F_COPYIN|BC_F_SUSER },
	{ bridge_ioctl_gto, sizeof(struct ifbrparam),
	  BC_F_COPYOUT },

	{ bridge_ioctl_daddr32, sizeof(struct ifbareq32),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_flush, sizeof(struct ifbreq),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_gpri, sizeof(struct ifbrparam),
	  BC_F_COPYOUT },
	{ bridge_ioctl_spri, sizeof(struct ifbrparam),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_ght, sizeof(struct ifbrparam),
	  BC_F_COPYOUT },
	{ bridge_ioctl_sht, sizeof(struct ifbrparam),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_gfd, sizeof(struct ifbrparam),
	  BC_F_COPYOUT },
	{ bridge_ioctl_sfd, sizeof(struct ifbrparam),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_gma, sizeof(struct ifbrparam),
	  BC_F_COPYOUT },
	{ bridge_ioctl_sma, sizeof(struct ifbrparam),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_sifprio, sizeof(struct ifbreq),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_sifcost, sizeof(struct ifbreq),
	  BC_F_COPYIN|BC_F_SUSER },

	/* APPLE MODIFICATION - add support for Proxy STA */
#if IEEE80211_PROXYSTA
	{ bridge_ioctl_purge, sizeof(struct ifbreq),
	  BC_F_COPYIN|BC_F_SUSER },
#endif
};

/*
 * Dispatch table used for the SIOC[SG]DRVSPEC64 requests.  Identical to the
 * 32-bit table except that the gifs/rts/saddr/daddr slots use the handlers
 * and argument sizes of the 64-bit structure layouts.
 */
static const struct bridge_control bridge_control_table64[] = {
	{ bridge_ioctl_add, sizeof(struct ifbreq),
	  BC_F_COPYIN|BC_F_SUSER },
	{ bridge_ioctl_del, sizeof(struct ifbreq),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_gifflags, sizeof(struct ifbreq),
	  BC_F_COPYIN|BC_F_COPYOUT },
	{ bridge_ioctl_sifflags, sizeof(struct ifbreq),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_scache, sizeof(struct ifbrparam),
	  BC_F_COPYIN|BC_F_SUSER },
	{ bridge_ioctl_gcache, sizeof(struct ifbrparam),
	  BC_F_COPYOUT },

	{ bridge_ioctl_gifs64, sizeof(struct ifbifconf64),
	  BC_F_COPYIN|BC_F_COPYOUT },
	{ bridge_ioctl_rts64, sizeof(struct ifbaconf64),
	  BC_F_COPYIN|BC_F_COPYOUT },

	{ bridge_ioctl_saddr64, sizeof(struct ifbareq64),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_sto, sizeof(struct ifbrparam),
	  BC_F_COPYIN|BC_F_SUSER },
	{ bridge_ioctl_gto, sizeof(struct ifbrparam),
	  BC_F_COPYOUT },

	{ bridge_ioctl_daddr64, sizeof(struct ifbareq64),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_flush, sizeof(struct ifbreq),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_gpri, sizeof(struct ifbrparam),
	  BC_F_COPYOUT },
	{ bridge_ioctl_spri, sizeof(struct ifbrparam),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_ght, sizeof(struct ifbrparam),
	  BC_F_COPYOUT },
	{ bridge_ioctl_sht, sizeof(struct ifbrparam),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_gfd, sizeof(struct ifbrparam),
	  BC_F_COPYOUT },
	{ bridge_ioctl_sfd, sizeof(struct ifbrparam),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_gma, sizeof(struct ifbrparam),
	  BC_F_COPYOUT },
	{ bridge_ioctl_sma, sizeof(struct ifbrparam),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_sifprio, sizeof(struct ifbreq),
	  BC_F_COPYIN|BC_F_SUSER },

	{ bridge_ioctl_sifcost, sizeof(struct ifbreq),
	  BC_F_COPYIN|BC_F_SUSER },

	/* APPLE MODIFICATION - add support for Proxy STA */
#if IEEE80211_PROXYSTA
	{ bridge_ioctl_purge, sizeof(struct ifbreq),
	  BC_F_COPYIN|BC_F_SUSER },
#endif
};

/*
 * Number of valid command slots.  Computed from the 32-bit table only, yet
 * used as the bound when indexing either table, so the two tables must
 * always have the same number of entries.
 */
static const unsigned int bridge_control_table_size =
sizeof(bridge_control_table32) / sizeof(bridge_control_table32[0]);

/* All bridge instances; guarded by bridge_list_lock. */
static LIST_HEAD(, bridge_softc) bridge_list = LIST_HEAD_INITIALIZER(bridge_list);

/* Lock group/attributes shared by every lock in this file; set up in bridgeattach(). */
static lck_grp_t *bridge_lock_grp = NULL;
static lck_attr_t *bridge_lock_attr = NULL;

static lck_rw_t *bridge_list_lock = NULL;


/* Cloner for "bridgeN" interfaces, unit numbers 0 .. IF_MAXUNIT. */
static struct if_clone bridge_cloner =
	IF_CLONE_INITIALIZER("bridge",
		bridge_clone_create,
		bridge_clone_destroy,
		0,
		IF_MAXUNIT);

#if BRIDGE_DEBUG

SYSCTL_DECL(_net_link);

+SYSCTL_NODE(_net_link, IFT_BRIDGE, bridge, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Bridge"); + +__private_extern__ int _if_brige_debug = 0; + +SYSCTL_INT(_net_link_bridge, OID_AUTO, debug, CTLFLAG_RW, + &_if_brige_debug, 0, "Bridge debug"); + +static void printf_ether_header(struct ether_header *eh); +static void printf_mbuf_data(mbuf_t m, size_t offset, size_t len); +static void printf_mbuf_pkthdr(mbuf_t m, const char *prefix, const char *suffix); +static void printf_mbuf(mbuf_t m, const char *prefix, const char *suffix); +static void link_print(struct sockaddr_dl * dl_p); + +void +printf_mbuf_pkthdr(mbuf_t m, const char *prefix, const char *suffix) +{ + if (m) + printf("%spktlen: %u rcvif: %p header: %p nextpkt: %p%s", + prefix ? prefix : "", + (unsigned int)mbuf_pkthdr_len(m), mbuf_pkthdr_rcvif(m), mbuf_pkthdr_header(m), mbuf_nextpkt(m), + suffix ? suffix : ""); + else + printf("%s%s\n", prefix, suffix); +} + +void +printf_mbuf(mbuf_t m, const char *prefix, const char *suffix) +{ + if (m) { + printf("%s%p type: %u flags: 0x%x len: %u data: %p maxlen: %u datastart: %p next: %p%s", + prefix ? prefix : "", + m, mbuf_type(m), mbuf_flags(m), (unsigned int)mbuf_len(m), mbuf_data(m), + (unsigned int)mbuf_maxlen(m), mbuf_datastart(m), mbuf_next(m), + !suffix || (mbuf_flags(m) & MBUF_PKTHDR) ? "" : suffix); + if ((mbuf_flags(m) & MBUF_PKTHDR)) + printf_mbuf_pkthdr(m, " ", suffix); + } else + printf("%s%s\n", prefix, suffix); +} + +void +printf_mbuf_data(mbuf_t m, size_t offset, size_t len) +{ + mbuf_t n; + size_t i, j; + size_t pktlen, mlen, maxlen; + unsigned char *ptr; + + pktlen = mbuf_pkthdr_len(m); + + if (offset > pktlen) + return; + + maxlen = (pktlen - offset > len) ? len : pktlen; + n = m; + mlen = mbuf_len(n); + ptr = mbuf_data(n); + for (i = 0, j = 0; i < maxlen; i++, j++) { + if (j >= mlen) { + n = mbuf_next(n); + if (n == 0) + break; + ptr = mbuf_data(n); + mlen = mbuf_len(n); + j = 0; + } + if (i >= offset) { + printf("%02x%s", ptr[j], i % 2 ? 
" " : ""); + } + } + return; +} + +static void +printf_ether_header(struct ether_header *eh) +{ + printf("%02x:%02x:%02x:%02x:%02x:%02x > %02x:%02x:%02x:%02x:%02x:%02x 0x%04x ", + eh->ether_shost[0], eh->ether_shost[1], eh->ether_shost[2], + eh->ether_shost[3], eh->ether_shost[4], eh->ether_shost[5], + eh->ether_dhost[0], eh->ether_dhost[1], eh->ether_dhost[2], + eh->ether_dhost[3], eh->ether_dhost[4], eh->ether_dhost[5], + eh->ether_type); +} +#endif /* BRIDGE_DEBUG */ + +/* + * bridgeattach: + * + * Pseudo-device attach routine. + */ +__private_extern__ int +bridgeattach(__unused int n) +{ + int error; + lck_grp_attr_t *lck_grp_attr = NULL; + + bridge_rtnode_pool = zinit(sizeof(struct bridge_rtnode), 1024 * sizeof(struct bridge_rtnode), + 0, "bridge_rtnode"); + + lck_grp_attr = lck_grp_attr_alloc_init(); + + bridge_lock_grp = lck_grp_alloc_init("if_bridge", lck_grp_attr); + + bridge_lock_attr = lck_attr_alloc_init(); + +#if BRIDGE_DEBUG + lck_attr_setdebug(bridge_lock_attr); +#endif + + bridge_list_lock = lck_rw_alloc_init(bridge_lock_grp, bridge_lock_attr); + + // can free the attributes once we've allocated the group lock + lck_grp_attr_free(lck_grp_attr); + + LIST_INIT(&bridge_list); + error = if_clone_attach(&bridge_cloner); + + return error; +} + +#if BRIDGE_DEBUG + +static void +link_print(struct sockaddr_dl * dl_p) +{ + int i; + +#if 1 + printf("sdl len %d index %d family %d type 0x%x nlen %d alen %d" + " slen %d addr ", dl_p->sdl_len, + dl_p->sdl_index, dl_p->sdl_family, dl_p->sdl_type, + dl_p->sdl_nlen, dl_p->sdl_alen, dl_p->sdl_slen); +#endif + for (i = 0; i < dl_p->sdl_alen; i++) + printf("%s%x", i ? ":" : "", + (CONST_LLADDR(dl_p))[i]); + printf("\n"); + return; +} +#endif /* BRIDGE_DEBUG */ + + +/* + * bridge_clone_create: + * + * Create a new bridge instance. + */ +/* APPLE MODIFICATION - add opaque argument for cloning. This is done for + net80211's VAP creation (with the Marvell codebase). 
   I think this could end up being useful
   for other devices, too. This is not in an ifdef because it doesn't hurt anything to have
   this extra param */
/*
 * Cloner "create" callback.  `ifc' supplies the interface name prefix and
 * `unit' the instance number.  Allocates and zeroes a softc, fills in the
 * default tunables, initialises the address table and the per-bridge mutex,
 * allocates and configures the ifnet (MTU, address/header length, baudrate,
 * flags), gives it a random AC:DE:48:xx:xx:xx address, attaches it, links
 * the softc on bridge_list and attaches BPF.  Returns 0 or the first error.
 *
 * NOTE(review): the comment above mentions an extra opaque argument, but the
 * function as written takes only (ifc, unit).
 *
 * NOTE(review): every failure path jumps to `done', which only logs; nothing
 * acquired so far (softc, address table, mutex, ifnet) is released there.
 */
static int
bridge_clone_create(struct if_clone *ifc, int unit)
{
	struct bridge_softc *sc = NULL;
	struct ifnet *ifp = NULL;
	u_char eaddr[6];
	uint32_t r;
	struct ifnet_init_params init_params;
	errno_t error = 0;
	/*
	 * NOTE(review): the element count is a byte count but the element
	 * type is uint32_t, so this buffer is four times larger than the
	 * expression suggests.  In this function `sdl' is only filled in and
	 * handed to link_print() under BRIDGE_DEBUG.
	 */
	uint32_t sdl_buffer[offsetof(struct sockaddr_dl, sdl_data) + IFNAMSIZ + ETHER_ADDR_LEN];
	struct sockaddr_dl *sdl = (struct sockaddr_dl *)sdl_buffer;

	/* NOTE(review): the allocation result is used without a NULL check. */
	sc = _MALLOC(sizeof(*sc), M_DEVBUF, M_WAITOK);
	memset(sc, 0, sizeof(*sc));

	sc->sc_brtmax = BRIDGE_RTABLE_MAX;
	/* APPLE MODIFICATION - add support for Proxy STA */
#if IEEE80211_PROXYSTA
	sc->sc_brtmax_proxysta = BRIDGE_RTABLE_MAX_PROXYSTA;
#endif
	sc->sc_brttimeout = BRIDGE_RTABLE_TIMEOUT;
	sc->sc_bridge_max_age = BSTP_DEFAULT_MAX_AGE;
	sc->sc_bridge_hello_time = BSTP_DEFAULT_HELLO_TIME;
	sc->sc_bridge_forward_delay = BSTP_DEFAULT_FORWARD_DELAY;
	sc->sc_bridge_priority = BSTP_DEFAULT_BRIDGE_PRIORITY;
	sc->sc_hold_time = BSTP_DEFAULT_HOLD_TIME;
	sc->sc_filter_flags = IFBF_FILT_DEFAULT;
#ifndef BRIDGE_IPF
	/*
	 * For backwards compatibility with previous behaviour...
	 * Switch off filtering on the bridge itself if BRIDGE_IPF is
	 * not defined.
	 */
	sc->sc_filter_flags &= ~IFBF_FILT_USEIPF;
#endif

	/* Initialize our routing table. */
	error = bridge_rtable_init(sc);
	if (error != 0) {
		printf("bridge_clone_create: bridge_rtable_init failed %d\n", error);
		goto done;
	}

	LIST_INIT(&sc->sc_iflist);

	/* per-bridge mutex; taken around every ioctl handler and in destroy */
	sc->sc_mtx = lck_mtx_alloc_init(bridge_lock_grp, bridge_lock_attr);

	/* use the interface name as the unique id for ifp recycle */
	snprintf(sc->sc_if_xname, sizeof(sc->sc_if_xname), "%s%d",
		ifc->ifc_name, unit);
	memset(&init_params, 0, sizeof(struct ifnet_init_params));
	init_params.uniqueid = sc->sc_if_xname;
	init_params.uniqueid_len = strlen(sc->sc_if_xname);
	init_params.name = ifc->ifc_name;
	init_params.unit = unit;
	/* the bridge presents itself as an Ethernet-family interface ... */
	init_params.family = IFNET_FAMILY_ETHERNET;
	init_params.type = IFT_BRIDGE;
	init_params.output = bridge_start;
	/* ... and reuses the generic Ethernet demux/framing callbacks */
	init_params.demux = ether_demux;
	init_params.add_proto = ether_add_proto;
	init_params.del_proto = ether_del_proto;
	init_params.check_multi = ether_check_multi;
	init_params.framer = ether_frameout;
	init_params.softc = sc;
	init_params.ioctl = bridge_ioctl;
	init_params.set_bpf_tap = bridge_set_bpf_tap;
	init_params.detach = bridge_detach;
	init_params.broadcast_addr = etherbroadcastaddr;
	init_params.broadcast_len = ETHER_ADDR_LEN;
	error = ifnet_allocate(&init_params, &ifp);
	if (error != 0) {
		printf("bridge_clone_create: ifnet_allocate failed %d\n", error);
		goto done;
	}
	sc->sc_if = ifp;

	error = ifnet_set_mtu(ifp, ETHERMTU);
	if (error != 0) {
		printf("bridge_clone_create: ifnet_set_mtu failed %d\n", error);
		goto done;
	}
	error = ifnet_set_addrlen(ifp, ETHER_ADDR_LEN);
	if (error != 0) {
		printf("bridge_clone_create: ifnet_set_addrlen failed %d\n", error);
		goto done;
	}
	error = ifnet_set_baudrate(ifp, 10000000) ; // XXX: this is what IONetworking does
	if (error != 0) {
		printf("bridge_clone_create: ifnet_set_baudrate failed %d\n", error);
		goto done;
	}
	error = ifnet_set_hdrlen(ifp, ETHER_HDR_LEN);
	if (error != 0) {
		printf("bridge_clone_create: ifnet_set_hdrlen failed %d\n", error);
		goto done;
	}
	error = ifnet_set_flags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_NOTRAILERS | IFF_MULTICAST,
		0xffff);
	if (error != 0) {
		printf("bridge_clone_create: ifnet_set_flags failed %d\n", error);
		goto done;
	}

	/*
	 * Generate a random ethernet address and use the private AC:DE:48
	 * OUI code.
	 */
	read_random(&r, sizeof(r));
	eaddr[0] = 0xAC;
	eaddr[1] = 0xDE;
	eaddr[2] = 0x48;
	/* low three bytes of the random word become the device part */
	eaddr[3] = (r >> 0) & 0xffu;
	eaddr[4] = (r >> 8) & 0xffu;
	eaddr[5] = (r >> 16) & 0xffu;

	memset(sdl, 0, sizeof(sdl_buffer));
	sdl->sdl_family = AF_LINK;
	sdl->sdl_nlen = strlen(sc->sc_if_xname);
	sdl->sdl_alen = ETHER_ADDR_LEN;
	/* NOTE(review): sdl_len covers only the fixed header, not name + address. */
	sdl->sdl_len = offsetof(struct sockaddr_dl, sdl_data);
	memcpy(sdl->sdl_data, sc->sc_if_xname, sdl->sdl_nlen);
	memcpy(LLADDR(sdl), eaddr, ETHER_ADDR_LEN);

#if BRIDGE_DEBUG
	link_print(sdl);
#endif

	error = ifnet_attach(ifp, NULL);
	if (error != 0) {
		printf("bridge_clone_create: ifnet_attach failed %d\n", error);
		goto done;
	}

	/* the link-layer address is installed after attach, with type IFT_ETHER */
	error = ifnet_set_lladdr_and_type(ifp, eaddr, ETHER_ADDR_LEN, IFT_ETHER);
	if (error != 0) {
		printf("bridge_clone_create: ifnet_set_lladdr_and_type failed %d\n", error);
		goto done;
	}

#if APPLE_BRIDGE_HWCKSUM_SUPPORT
	/*
	 * APPLE MODIFICATION - our bridge can support HW checksums
	 * (useful if underlying interfaces support them) on TX,
	 * RX is not that interesting, since the stack just looks to
	 * see if the packet has been checksummed already (I think)
	 * but we might as well indicate we support it
	 */
	ifp->if_capabilities =
		IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_UDPv4_Tx |
		IFCAP_CSUM_IPv4_Rx | IFCAP_CSUM_TCPv4_Rx | IFCAP_CSUM_UDPv4_Rx ;
#endif

	/* publish the new bridge on the global list */
	lck_rw_lock_exclusive(bridge_list_lock);
	LIST_INSERT_HEAD(&bridge_list, sc, sc_list);
	lck_rw_done(bridge_list_lock);

	/*
	 * attach as ethernet
	 *
	 * NOTE(review): a failure here is returned to the caller although the
	 * interface is already attached and on bridge_list.
	 */
	error = bpf_attach(ifp, DLT_EN10MB, sizeof(struct ether_header), NULL, NULL);

done:
	if (error != 0) {
		printf("bridge_clone_create failed error %d\n", error);
		/* Cleanup TBD */
	}

	return error;
}

/*
 * bridge_clone_destroy:
 *
 *	Destroy a bridge instance.
 *
 *	Marks the softc SCF_DETACHING under the bridge mutex (returning
 *	immediately if a destroy is already in progress), stops the bridge,
 *	clears IFF_UP, removes every member interface, then drops the mutex
 *	and detaches the ifnet.  If ifnet_detach() fails, the detaching flag
 *	is cleared again, provided the ifnet still has a softc.
 */
static void
bridge_clone_destroy(struct ifnet *ifp)
{
	struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp);
	struct bridge_iflist *bif;
	int error;

	lck_mtx_lock(sc->sc_mtx);
	if ((sc->sc_flags & SCF_DETACHING)) {
		/* someone else is already tearing this bridge down */
		lck_mtx_unlock(sc->sc_mtx);
		return;
	}
	sc->sc_flags |= SCF_DETACHING;

	bridge_stop(ifp, 1);

	error = ifnet_set_flags(ifp, 0, IFF_UP);
	if (error != 0) {
		printf("bridge_clone_destroy: ifnet_set_flags failed %d\n", error);
	}

	/* the loop re-reads the list head each time, so it ends once the list is empty */
	while ((bif = LIST_FIRST(&sc->sc_iflist)) != NULL)
		bridge_delete_member(sc, bif);

	lck_mtx_unlock(sc->sc_mtx);

	/* ifnet_detach() is called without the bridge mutex held */
	error = ifnet_detach(ifp);
	if (error != 0) {
		printf("bridge_clone_destroy: ifnet_detach failed %d\n", error);
		/* re-fetch the softc rather than reusing the earlier pointer */
		if ((sc = (struct bridge_softc *)ifnet_softc(ifp)) != NULL) {
			lck_mtx_lock(sc->sc_mtx);
			sc->sc_flags &= ~SCF_DETACHING;
			lck_mtx_unlock(sc->sc_mtx);
		}
	}

	return;
}

/*
 * DRVSPEC: shared body of the SIOC[SG]DRVSPEC32/64 cases of bridge_ioctl().
 *
 * Must be expanded directly inside a switch case: it bails out with `break',
 * and it expects `ifd', `bc', `bridge_control_table', `args', `cmd', `sc'
 * and `error' to be in scope.  In order, it:
 *   - rejects command numbers outside the dispatch table;
 *   - requires the ioctl direction to agree with BC_F_COPYOUT (an in/out
 *     ioctl must target a copy-out command, an in-only ioctl must not);
 *   - performs the super-user check for BC_F_SUSER commands;
 *   - requires ifd_len to equal the command's argument size and to fit
 *     in `args';
 *   - zeroes `args' and copies the argument in for BC_F_COPYIN commands;
 *   - calls the handler with the bridge mutex held;
 *   - copies the argument back out for BC_F_COPYOUT commands.
 */
#define DRVSPEC \
	if (ifd->ifd_cmd >= bridge_control_table_size) { \
		error = EINVAL; \
		break; \
	} \
	bc = &bridge_control_table[ifd->ifd_cmd]; \
	\
	if ((cmd & IOC_DIRMASK) == IOC_INOUT && \
		(bc->bc_flags & BC_F_COPYOUT) == 0) { \
		error = EINVAL; \
		break; \
	} \
	else if (((cmd & IOC_DIRMASK) == IOC_IN) && \
		(bc->bc_flags & BC_F_COPYOUT) != 0) { \
		error = EINVAL; \
		break; \
	} \
	\
	if (bc->bc_flags & BC_F_SUSER) { \
		error = kauth_authorize_generic(kauth_cred_get(), KAUTH_GENERIC_ISSUSER); \
		if (error) \
			break; \
	} \
	\
	if (ifd->ifd_len != bc->bc_argsize || \
		ifd->ifd_len > sizeof(args)) { \
		error = EINVAL; \
		break; \
	} \
	\
	memset(&args, 0, sizeof(args)); \
	if (bc->bc_flags & BC_F_COPYIN) { \
		error = copyin(ifd->ifd_data, &args, ifd->ifd_len); \
		if (error) \
			break; \
	} \
	\
	lck_mtx_lock(sc->sc_mtx); \
	error = (*bc->bc_func)(sc, &args); \
	lck_mtx_unlock(sc->sc_mtx); \
	if (error) \
break; \ + \ + if (bc->bc_flags & BC_F_COPYOUT) \ + error = copyout(&args, ifd->ifd_data, ifd->ifd_len) + +/* + * bridge_ioctl: + * + * Handle a control request from the operator. + */ +static errno_t +bridge_ioctl(ifnet_t ifp, unsigned long cmd, void *data) +{ + struct bridge_softc *sc = ifnet_softc(ifp); + struct ifreq *ifr = (struct ifreq *) data; + int error = 0; + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + +#if BRIDGE_DEBUG + printf("bridge_ioctl: ifp %p cmd 0x%08lx (%c%c [%lu] %c %lu)\n", + ifp, + cmd, + (cmd & IOC_IN) ? 'I' : ' ', + (cmd & IOC_OUT) ? 'O' : ' ', + IOCPARM_LEN(cmd), + (char)IOCGROUP(cmd), + cmd & 0xff); + printf("SIOCGDRVSPEC32 %lx SIOCGDRVSPEC64 %lx\n", SIOCGDRVSPEC32, SIOCGDRVSPEC64); +#endif + + switch (cmd) { + case SIOCADDMULTI: + break; + case SIOCDELMULTI: + break; + + case SIOCSDRVSPEC32: + case SIOCGDRVSPEC32: { + union { + struct ifbreq ifbreq; + struct ifbifconf32 ifbifconf; + struct ifbareq32 ifbareq; + struct ifbaconf32 ifbaconf; + struct ifbrparam ifbrparam; + } args; + struct ifdrv32 *ifd = (struct ifdrv32 *) data; + const struct bridge_control *bridge_control_table = bridge_control_table32, *bc; + + DRVSPEC; + + break; + } + case SIOCSDRVSPEC64: + case SIOCGDRVSPEC64: { + union { + struct ifbreq ifbreq; + struct ifbifconf64 ifbifconf; + struct ifbareq64 ifbareq; + struct ifbaconf64 ifbaconf; + struct ifbrparam ifbrparam; + } args; + struct ifdrv64 *ifd = (struct ifdrv64 *) data; + const struct bridge_control *bridge_control_table = bridge_control_table64, *bc; + + DRVSPEC; + + break; + } + + case SIOCSIFFLAGS: + if ((ifnet_flags(ifp) & (IFF_UP|IFF_RUNNING)) == IFF_RUNNING) { + /* + * If interface is marked down and it is running, + * then stop and disable it. + */ + lck_mtx_lock(sc->sc_mtx); + bridge_stop(ifp, 1); + lck_mtx_unlock(sc->sc_mtx); + } else if ((ifnet_flags(ifp) & (IFF_UP|IFF_RUNNING)) == IFF_UP) { + /* + * If interface is marked up and it is stopped, then + * start it. 
+ */ + lck_mtx_lock(sc->sc_mtx); + error = bridge_init(ifp); + lck_mtx_unlock(sc->sc_mtx); + } + break; + + case SIOCSIFMTU: +#if 0 + /* APPLE MODIFICATION + if we wanted to support changing the MTU */ + { + struct ifreq *ifr = (struct ifreq *)data; + struct bridge_iflist *bif; + struct ifnet *dst_if; + sc->sc_if.if_mtu = ifr->ifr_mtu; + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + dst_if = bif->bif_ifp; + error = ifnet_ioctl(dst_if, 0, cmd, data); + if (error) + break; + } + } +#else + /* Do not allow the MTU to be changed on the bridge */ + error = EINVAL; +#endif + break; + + /* APPLE MODIFICATION - don't pass this down to ether_ioctl, just indicate we don't handle it */ + case SIOCGIFMEDIA: + error = EINVAL; + break; + + case SIOCSIFLLADDR: + error = ifnet_set_lladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); + if (error != 0) + printf("bridge_ioctl: ifnet_set_lladdr failed %d\n", error); + break; + + default: + error = ether_ioctl(ifp, cmd, data); +#if BRIDGE_DEBUG + if (error != 0) + printf("bridge_ioctl: ether_ioctl ifp %p cmd 0x%08lx (%c%c [%lu] %c %lu) failed error: %d\n", + ifp, + cmd, + (cmd & IOC_IN) ? 'I' : ' ', + (cmd & IOC_OUT) ? 
'O' : ' ', + IOCPARM_LEN(cmd), + (char) IOCGROUP(cmd), + cmd & 0xff, + error); +#endif /* BRIDGE_DEBUG */ + break; + } + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + + return (error); +} + +/* + * bridge_mutecaps: + * + * Clear or restore unwanted capabilities on the member interface + */ +#if HAS_IF_CAP +void +bridge_mutecaps(struct bridge_iflist *bif, int mute) +{ + struct ifnet *ifp = bif->bif_ifp; + struct ifcapreq ifcr; + + if (ifp->if_ioctl == NULL) + return; + + memset(&ifcr, 0, sizeof(ifcr)); + ifcr.ifcr_capenable = ifp->if_capenable; + + if (mute) { + /* mask off and save capabilities */ + bif->bif_mutecap = ifcr.ifcr_capenable & BRIDGE_IFCAPS_MASK; + if (bif->bif_mutecap != 0) + ifcr.ifcr_capenable &= ~BRIDGE_IFCAPS_MASK; + } else + /* restore muted capabilities */ + ifcr.ifcr_capenable |= bif->bif_mutecap; + + if (bif->bif_mutecap != 0) { + (void) (*ifp->if_ioctl)(ifp, SIOCSIFCAP, (caddr_t)&ifcr); + } +} +#endif /* HAS_IF_CAP */ + +/* + * bridge_lookup_member: + */ +static struct bridge_iflist * +bridge_lookup_member(struct bridge_softc *sc, const char *name) +{ + struct bridge_iflist *bif; + struct ifnet *ifp; + char if_xname[IFNAMSIZ]; + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + ifp = bif->bif_ifp; + snprintf(if_xname, sizeof(if_xname), "%s%d", + ifnet_name(ifp), ifnet_unit(ifp)); + if (strncmp(if_xname, name, sizeof(if_xname)) == 0) + return (bif); + } + + return (NULL); +} + +/* + * bridge_lookup_member_if: + */ +static struct bridge_iflist * +bridge_lookup_member_if(struct bridge_softc *sc, struct ifnet *member_ifp) +{ + struct bridge_iflist *bif; + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if (bif->bif_ifp == member_ifp) + return (bif); + } + + return (NULL); +} + +static errno_t +bridge_iff_input(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol, + mbuf_t *data, char **frame_ptr) +{ + errno_t error = 0; 
+ struct bridge_iflist *bif = (struct bridge_iflist *)cookie; + struct bridge_softc *sc = bif->bif_sc; + int included = 0; + size_t frmlen = 0; + mbuf_t m = *data; + + if ((m->m_flags & M_PROTO1)) + goto out; + + if (*frame_ptr >= (char *)mbuf_datastart(m) && *frame_ptr <= (char *)mbuf_data(m)) { + included = 1; + frmlen = (char *)mbuf_data(m) - *frame_ptr; + } +#if BRIDGE_DEBUG + if (_if_brige_debug) { + printf("bridge_iff_input %s%d from %s%d m %p data %p frame %p %s frmlen %lu\n", + ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if), + ifnet_name(ifp), ifnet_unit(ifp), + m, mbuf_data(m), *frame_ptr, included ? "inside" : "outside", frmlen); + + if (_if_brige_debug > 1) { + printf_mbuf(m, "bridge_iff_input[", "\n"); + printf_ether_header((struct ether_header *)*frame_ptr); + printf_mbuf_data(m, 0, 20); + printf("\n"); + } + } +#endif /* BRIDGE_DEBUG */ + + /* Move data pointer to start of frame to the link layer header */ + if (included) { + (void) mbuf_setdata(m, (char *)mbuf_data(m) - frmlen, mbuf_len(m) + frmlen); + (void) mbuf_pkthdr_adjustlen(m, frmlen); + } else { + printf("bridge_iff_input: frame_ptr outside mbuf\n"); + goto out; + } + + error = bridge_input(bif, ifp, m, *frame_ptr); + + /* Adjust packet back to original */ + if (error == 0) { + (void) mbuf_setdata(m, (char *)mbuf_data(m) + frmlen, mbuf_len(m) - frmlen); + (void) mbuf_pkthdr_adjustlen(m, -frmlen); + } +#if BRIDGE_DEBUG + if (_if_brige_debug > 1) { + printf("\n"); + printf_mbuf(m, "bridge_iff_input]", "\n"); + } +#endif /* BRIDGE_DEBUG */ + +out: + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + + return error; +} + + +#if BRIDGE_MEMBER_OUT_FILTER +static errno_t +bridge_iff_output(void *cookie, ifnet_t ifp, __unused protocol_family_t protocol, mbuf_t *data) +{ + errno_t error = 0; + struct bridge_iflist *bif = (struct bridge_iflist *)cookie; + struct bridge_softc *sc = bif->bif_sc; + mbuf_t m = *data; + + if ((m->m_flags & M_PROTO1)) + goto out; + +#if BRIDGE_DEBUG + if (_if_brige_debug) 
{ + printf("bridge_iff_output %s%d from %s%d m %p data %p\n", + ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if), + ifnet_name(ifp), ifnet_unit(ifp), + m, mbuf_data(m)); + } +#endif /* BRIDGE_DEBUG */ + + error = bridge_output(sc, ifp, m); + if (error != 0) { + printf("bridge_iff_output: bridge_output failed error %d\n", error); + } + +out: + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + + return error; +} +#endif /* BRIDGE_MEMBER_OUT_FILTER */ + + +static void +bridge_iff_event(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol, + const struct kev_msg *event_msg) +{ + struct bridge_iflist *bif = (struct bridge_iflist *)cookie; + + if (event_msg->vendor_code == KEV_VENDOR_APPLE && + event_msg->kev_class == KEV_NETWORK_CLASS && + event_msg->kev_subclass == KEV_DL_SUBCLASS) { + switch (event_msg->event_code) { + case KEV_DL_IF_DETACHING: + bridge_ifdetach(bif, ifp); + break; + + default: + break; + } + } +} + +static void +bridge_iff_detached(void* cookie, __unused ifnet_t interface) +{ + struct bridge_iflist *bif = (struct bridge_iflist *)cookie; + + _FREE(bif, M_DEVBUF); + + return; +} + +/* + * bridge_delete_member: + * + * Delete the specified member interface. + */ +static void +bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif) +{ + struct ifnet *ifs = bif->bif_ifp; + + switch (ifnet_type(ifs)) { + case IFT_ETHER: + /* + * Take the interface out of promiscuous mode. 
+ */ + (void) ifnet_set_promiscuous(ifs, 0); + break; +#if NGIF > 0 + case IFT_GIF: + break; +#endif + default: +#ifdef DIAGNOSTIC + panic("bridge_delete_member: impossible"); +#endif + break; + } + + ifs->if_bridge = NULL; + LIST_REMOVE(bif, bif_next); + + /* Respect lock ordering with DLIL lock */ + lck_mtx_unlock(sc->sc_mtx); + iflt_detach(bif->bif_iff_ref); + lck_mtx_lock(sc->sc_mtx); + + bridge_rtdelete(sc, ifs, IFBF_FLUSHALL); + + if (ifnet_flags(sc->sc_if) & IFF_RUNNING) + bstp_initialization(sc); + + /* On the last deleted interface revert the MTU */ + + if (LIST_EMPTY(&sc->sc_iflist)) + (void) ifnet_set_mtu(sc->sc_if, ETHERMTU); +} + +static int +bridge_ioctl_add(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif = NULL; + struct ifnet *ifs; + int error = 0; + /* APPLE MODIFICATION - is this a proxy sta being added? */ +#if IEEE80211_PROXYSTA + struct bridge_rtnode *brt; +#endif + + error = ifnet_find_by_name(req->ifbr_ifsname, &ifs); + if (error || ifs == NULL) + return (ENOENT); + + /* Is the interface already attached to this bridge interface */ + if (ifs->if_bridge == sc) + return (EEXIST); + + if (ifs->if_bridge != NULL) + return (EBUSY); + + /* First added interface resets the MTU */ + + if (LIST_EMPTY(&sc->sc_iflist)) + (void) ifnet_set_mtu(sc->sc_if, ETHERMTU); + + if (ifnet_mtu(sc->sc_if) != ifnet_mtu(ifs)) + return (EINVAL); + + bif = _MALLOC(sizeof(*bif), M_DEVBUF, M_WAITOK|M_ZERO); + if (bif == NULL) + return (ENOMEM); + + bif->bif_ifp = ifs; + bif->bif_flags = IFBIF_LEARNING | IFBIF_DISCOVER; + bif->bif_priority = BSTP_DEFAULT_PORT_PRIORITY; + bif->bif_path_cost = BSTP_DEFAULT_PATH_COST; + bif->bif_sc = sc; + + switch (ifnet_type(ifs)) { + case IFT_ETHER: + /* + * Place the interface into promiscuous mode. 
+ */ + error = ifnet_set_promiscuous(ifs, 1); + if (error) + goto out; +#if HAS_IF_CAP + bridge_mutecaps(bif, 1); +#endif + break; +#if NGIF > 0 + case IFT_GIF: + break; +#endif + default: + error = EINVAL; + goto out; + } + + /* + * If the LINK0 flag is set, and this is the first member interface, + * attempt to inherit its link-layer address. + */ + if ((ifnet_flags(sc->sc_if) & IFF_LINK0) && LIST_EMPTY(&sc->sc_iflist) && + ifnet_type(ifs) == IFT_ETHER) { + (void) ifnet_set_lladdr(sc->sc_if, ifnet_lladdr(ifs), + ETHER_ADDR_LEN); + } + + // install an interface filter + { + struct iff_filter iff; + + memset(&iff, 0, sizeof(struct iff_filter)); + + iff.iff_cookie = bif; + iff.iff_name = "com.apple.kernel.bsd.net.if_bridge"; + iff.iff_input = bridge_iff_input; +#if BRIDGE_MEMBER_OUT_FILTER + iff.iff_output = bridge_iff_output; +#endif /* BRIDGE_MEMBER_OUT_FILTER */ + iff.iff_event = bridge_iff_event; + iff.iff_detached = bridge_iff_detached; + + /* Respect lock ordering with DLIL lock */ + lck_mtx_unlock(sc->sc_mtx); + error = iflt_attach(ifs, &iff, &bif->bif_iff_ref); + lck_mtx_lock(sc->sc_mtx); + if (error != 0) { + printf("bridge_ioctl_add: iflt_attach failed %d\n", error); + goto out; + } + } + ifs->if_bridge = sc; + LIST_INSERT_HEAD(&sc->sc_iflist, bif, bif_next); + + + if (ifnet_flags(sc->sc_if) & IFF_RUNNING) + bstp_initialization(sc); + else + bstp_stop(sc); + + /* APPLE MODIFICATION - is this a proxy sta being added? 
*/ +#if IEEE80211_PROXYSTA + brt = bridge_rtnode_lookup(sc, ifnet_lladdr(ifs)); + if (brt) { +#if DIAGNOSTIC + printf( "%s: attach %s to bridge as proxysta for %02x:%02x:%02x:%02x:%02x:%02x discovered on %s\n", + __func__, ifs->if_xname, brt->brt_addr[0], brt->brt_addr[1], brt->brt_addr[2], + brt->brt_addr[3], brt->brt_addr[4], brt->brt_addr[5], brt->brt_ifp->if_xname ); +#endif + brt->brt_ifp_proxysta = ifs; + } +#endif + + +out: + if (error) { + if (bif != NULL) + _FREE(bif, M_DEVBUF); + } + return (error); +} + +static int +bridge_ioctl_del(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + bridge_delete_member(sc, bif); + + return (0); +} + +/* APPLE MODIFICATION - add support for Proxy STA */ +#if IEEE80211_PROXYSTA +static int +bridge_ioctl_purge(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + struct ifnet *ifs; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + ifs = bif->bif_ifp; + bridge_rtpurge(sc, ifs); + + return (0); +} +#endif + +static int +bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + req->ifbr_ifsflags = bif->bif_flags; + req->ifbr_state = bif->bif_state; + req->ifbr_priority = bif->bif_priority; + req->ifbr_path_cost = bif->bif_path_cost; + req->ifbr_portno = ifnet_index(bif->bif_ifp) & 0xffff; + + return (0); +} + +static int +bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + if (req->ifbr_ifsflags & IFBIF_STP) { + switch (ifnet_type(bif->bif_ifp)) { + case IFT_ETHER: + /* These can do 
spanning tree. */ + break; + + default: + /* Nothing else can. */ + return (EINVAL); + } + } + + /* APPLE MODIFICATION - add support for Proxy STA */ +#if IEEE80211_PROXYSTA + if ((bif->bif_flags & IFBIF_PROXYSTA_DISCOVER) && + ((req->ifbr_ifsflags & IFBIF_PROXYSTA_DISCOVER) == 0)) + bridge_rtpurge(sc, bif->bif_ifp); +#endif + + bif->bif_flags = req->ifbr_ifsflags; + + if (ifnet_flags(sc->sc_if) & IFF_RUNNING) + bstp_initialization(sc); + + /* APPLE MODIFICATION - add support for Proxy STA */ +#if IEEE80211_PROXYSTA + if (bif->bif_flags & IFBIF_PROXYSTA_DISCOVER) + bridge_rtdiscovery(sc); +#endif + + return (0); +} + +static int +bridge_ioctl_scache(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + sc->sc_brtmax = param->ifbrp_csize; + bridge_rttrim(sc); + + return (0); +} + +static int +bridge_ioctl_gcache(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_csize = sc->sc_brtmax; + + return (0); +} + +#define BRIDGE_IOCTL_GIFS \ + struct bridge_iflist *bif; \ + struct ifbreq breq; \ + int count, error = 0; \ + uint32_t len; \ + \ + count = 0; \ + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) \ + count++; \ + \ + if (bifc->ifbic_len == 0) { \ + bifc->ifbic_len = sizeof(breq) * count; \ + return (0); \ + } \ + \ + count = 0; \ + len = bifc->ifbic_len; \ + memset(&breq, 0, sizeof breq); \ + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { \ + if (len < sizeof(breq)) \ + break; \ + \ + snprintf(breq.ifbr_ifsname, sizeof(breq.ifbr_ifsname), "%s%d", \ + ifnet_name(bif->bif_ifp), ifnet_unit(bif->bif_ifp)); \ + breq.ifbr_ifsflags = bif->bif_flags; \ + breq.ifbr_state = bif->bif_state; \ + breq.ifbr_priority = bif->bif_priority; \ + breq.ifbr_path_cost = bif->bif_path_cost; \ + breq.ifbr_portno = ifnet_index(bif->bif_ifp) & 0xffff; \ + error = copyout(&breq, bifc->ifbic_req + count * sizeof(breq), sizeof(breq)); \ + if (error) \ + break; \ + count++; \ + len -= sizeof(breq); \ + } \ + \ + bifc->ifbic_len = 
sizeof(breq) * count + + +static int +bridge_ioctl_gifs64(struct bridge_softc *sc, void *arg) +{ + struct ifbifconf64 *bifc = arg; + + BRIDGE_IOCTL_GIFS; + + return (error); +} + +static int +bridge_ioctl_gifs32(struct bridge_softc *sc, void *arg) +{ + struct ifbifconf32 *bifc = arg; + + BRIDGE_IOCTL_GIFS; + + return (error); +} + +#define BRIDGE_IOCTL_RTS \ + struct bridge_rtnode *brt; \ + int count = 0, error = 0; \ + uint32_t len; \ + struct timespec now; \ + \ + if (bac->ifbac_len == 0) \ + return (0); \ + \ + len = bac->ifbac_len; \ + LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { \ + if (len < sizeof(bareq)) \ + goto out; \ + memset(&bareq, 0, sizeof(bareq)); \ + snprintf(bareq.ifba_ifsname, sizeof(bareq.ifba_ifsname), "%s%d", \ + ifnet_name(brt->brt_ifp), ifnet_unit(brt->brt_ifp)); \ + memcpy(bareq.ifba_dst, brt->brt_addr, sizeof(brt->brt_addr)); \ + if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { \ + nanouptime(&now); \ + if (brt->brt_expire >= (unsigned long)now.tv_sec) \ + bareq.ifba_expire = brt->brt_expire - now.tv_sec; \ + else \ + bareq.ifba_expire = 0; \ + } else \ + bareq.ifba_expire = 0; \ + bareq.ifba_flags = brt->brt_flags; \ + \ + error = copyout(&bareq, bac->ifbac_req + count * sizeof(bareq), sizeof(bareq)); \ + if (error) \ + goto out; \ + count++; \ + len -= sizeof(bareq); \ + } \ +out: \ + bac->ifbac_len = sizeof(bareq) * count + + +static int +bridge_ioctl_rts64(struct bridge_softc *sc, void *arg) +{ + struct ifbaconf64 *bac = arg; + struct ifbareq64 bareq; + + BRIDGE_IOCTL_RTS; + + return (error); +} + +static int +bridge_ioctl_rts32(struct bridge_softc *sc, void *arg) +{ + struct ifbaconf32 *bac = arg; + struct ifbareq32 bareq; + + BRIDGE_IOCTL_RTS; + + return (error); +} + +static int +bridge_ioctl_saddr64(struct bridge_softc *sc, void *arg) +{ + struct ifbareq64 *req = arg; + struct bridge_iflist *bif; + int error; + + bif = bridge_lookup_member(sc, req->ifba_ifsname); + if (bif == NULL) + return (ENOENT); + + error = 
bridge_rtupdate(sc, req->ifba_dst, bif->bif_ifp, 1, + req->ifba_flags); + + return (error); +} + +static int +bridge_ioctl_saddr32(struct bridge_softc *sc, void *arg) +{ + struct ifbareq32 *req = arg; + struct bridge_iflist *bif; + int error; + + bif = bridge_lookup_member(sc, req->ifba_ifsname); + if (bif == NULL) + return (ENOENT); + + error = bridge_rtupdate(sc, req->ifba_dst, bif->bif_ifp, 1, + req->ifba_flags); + + return (error); +} + +static int +bridge_ioctl_sto(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + sc->sc_brttimeout = param->ifbrp_ctime; + + return (0); +} + +static int +bridge_ioctl_gto(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_ctime = sc->sc_brttimeout; + + return (0); +} + +static int +bridge_ioctl_daddr64(struct bridge_softc *sc, void *arg) +{ + struct ifbareq64 *req = arg; + + return (bridge_rtdaddr(sc, req->ifba_dst)); +} + +static int +bridge_ioctl_daddr32(struct bridge_softc *sc, void *arg) +{ + struct ifbareq32 *req = arg; + + return (bridge_rtdaddr(sc, req->ifba_dst)); +} + +static int +bridge_ioctl_flush(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + + bridge_rtflush(sc, req->ifbr_ifsflags); + + return (0); +} + +static int +bridge_ioctl_gpri(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_prio = sc->sc_bridge_priority; + + return (0); +} + +static int +bridge_ioctl_spri(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + sc->sc_bridge_priority = param->ifbrp_prio; + + if (ifnet_flags(sc->sc_if) & IFF_RUNNING) + bstp_initialization(sc); + + return (0); +} + +static int +bridge_ioctl_ght(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_hellotime = sc->sc_bridge_hello_time >> 8; + + return (0); +} + +static int +bridge_ioctl_sht(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + if (param->ifbrp_hellotime == 0) + 
return (EINVAL); + sc->sc_bridge_hello_time = param->ifbrp_hellotime << 8; + + if (ifnet_flags(sc->sc_if) & IFF_RUNNING) + bstp_initialization(sc); + + return (0); +} + +static int +bridge_ioctl_gfd(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_fwddelay = sc->sc_bridge_forward_delay >> 8; + + return (0); +} + +static int +bridge_ioctl_sfd(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + if (param->ifbrp_fwddelay == 0) + return (EINVAL); + sc->sc_bridge_forward_delay = param->ifbrp_fwddelay << 8; + + if (ifnet_flags(sc->sc_if) & IFF_RUNNING) + bstp_initialization(sc); + + return (0); +} + +static int +bridge_ioctl_gma(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_maxage = sc->sc_bridge_max_age >> 8; + + return (0); +} + +static int +bridge_ioctl_sma(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + if (param->ifbrp_maxage == 0) + return (EINVAL); + sc->sc_bridge_max_age = param->ifbrp_maxage << 8; + + if (ifnet_flags(sc->sc_if) & IFF_RUNNING) + bstp_initialization(sc); + + return (0); +} + +static int +bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + bif->bif_priority = req->ifbr_priority; + + if (ifnet_flags(sc->sc_if) & IFF_RUNNING) + bstp_initialization(sc); + + return (0); +} + +/* APPLE MODIFICATION - add support for Proxy STA */ +#if IEEE80211_PROXYSTA +static void +bridge_proxysta_notify_macaddr(struct ifnet *ifp, int op, const uint8_t *mac) +{ + struct proxy_sta_event iev; + + memset(&iev, 0, sizeof(iev)); + memcpy(iev.iev_addr, mac, ETHER_ADDR_LEN); + + rt_proxystamsg(ifp, op, &iev, sizeof(iev)); +} + +static void +bridge_proxysta_discover(struct ifnet *ifp, const uint8_t *mac) +{ + bridge_proxysta_notify_macaddr( ifp, RTM_PROXYSTA_DISCOVERY, mac ); +} + 
+static void +bridge_proxysta_idle_timeout(struct ifnet *ifp, const uint8_t *mac) +{ + bridge_proxysta_notify_macaddr( ifp, RTM_PROXYSTA_IDLE_TIMEOUT, mac ); +} +#endif + +static int +bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + bif->bif_path_cost = req->ifbr_path_cost; + + if (ifnet_flags(sc->sc_if) & IFF_RUNNING) + bstp_initialization(sc); + + return (0); +} + +/* + * bridge_ifdetach: + * + * Detach an interface from a bridge. Called when a member + * interface is detaching. + */ +static void +bridge_ifdetach(struct bridge_iflist *bif, struct ifnet *ifp) +{ + struct bridge_softc *sc = bif->bif_sc; + struct ifbreq breq; + + memset(&breq, 0, sizeof(breq)); + snprintf(breq.ifbr_ifsname, sizeof(breq.ifbr_ifsname), "%s%d", + ifnet_name(ifp), ifnet_unit(ifp)); + + lck_mtx_lock(sc->sc_mtx); + + (void) bridge_ioctl_del(sc, &breq); + + lck_mtx_unlock(sc->sc_mtx); +} + +/* + * bridge_init: + * + * Initialize a bridge interface. + */ +static int +bridge_init(struct ifnet *ifp) +{ + struct bridge_softc *sc = ifnet_softc(ifp); + struct timespec ts; + errno_t error; + + if (ifnet_flags(ifp) & IFF_RUNNING) + return (0); + + ts.tv_sec = bridge_rtable_prune_period; + ts.tv_nsec = 0; + bsd_timeout(bridge_timer, sc, &ts); + + error = ifnet_set_flags(ifp, IFF_RUNNING, IFF_RUNNING); + if (error == 0) + bstp_initialization(sc); + + return error; +} + +/* + * bridge_stop: + * + * Stop the bridge interface. + */ +static void +bridge_stop(struct ifnet *ifp, __unused int disable) +{ + struct bridge_softc *sc = ifnet_softc(ifp); + + if ((ifnet_flags(ifp) & IFF_RUNNING) == 0) + return; + + bsd_untimeout(bridge_timer, sc); + bstp_stop(sc); + + bridge_rtflush(sc, IFBF_FLUSHDYN); + + (void) ifnet_set_flags(ifp, 0, IFF_RUNNING); +} + +/* + * bridge_enqueue: + * + * Enqueue a packet on a bridge member interface. 
+ * + * Note: this is called both on the input and output path so this routine + * cannot simply muck with the HW checksum flag. For the time being we + * rely on the caller to do the right thing. + */ +__private_extern__ void +bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m) +{ + int len, error; + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf("bridge_enqueue sc %s%d to dst_ifp %s%d m %p\n", + ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if), + ifnet_name(dst_ifp), ifnet_unit(dst_ifp), m); +#endif /* BRIDGE_DEBUG */ + + len = m->m_pkthdr.len; + m->m_flags |= M_PROTO1; //set to avoid loops + + error = ifnet_output_raw(dst_ifp, 0, m); + if (error == 0) { + (void) ifnet_stat_increment_out(sc->sc_if, 1, len, 0); + } else { + (void) ifnet_stat_increment_out(sc->sc_if, 0, 0, 1); + } + + return; +} + + +#if BRIDGE_MEMBER_OUT_FILTER + +/* + * bridge_output: + * + * Send output from a bridge member interface. This + * performs the bridging function for locally originated + * packets. + * + * The mbuf has the Ethernet header already attached. We must + * enqueue or free the mbuf before returning. + */ +static int +bridge_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t m) +{ + struct ether_header *eh; + struct ifnet *dst_if; + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf("bridge_output ifp %p %s%d\n", ifp, ifnet_name(ifp), ifnet_unit(ifp)); +#endif /* BRIDGE_DEBUG */ + + if (m->m_len < ETHER_HDR_LEN) { + m = m_pullup(m, ETHER_HDR_LEN); + if (m == NULL) { + printf("bridge_output ifp %p m_pullup failed\n", ifp); + return EJUSTRETURN; + } + } + + eh = mtod(m, struct ether_header *); + + /* APPLE MODIFICATION + * If the packet is an 802.1X ethertype, then only send on the + * original output interface. 
+ */ + if (eh->ether_type == htons(ETHERTYPE_PAE)) { + dst_if = ifp; + goto sendunicast; + } + + /* + * If bridge is down, but the original output interface is up, + * go ahead and send out that interface. Otherwise, the packet + * is dropped below. + */ + if ((ifnet_flags(sc->sc_if) & IFF_RUNNING) == 0) { + dst_if = ifp; + goto sendunicast; + } + + lck_mtx_lock(sc->sc_mtx); + + /* + * If the packet is a multicast, or we don't know a better way to + * get there, send to all interfaces. + */ + if (ETHER_IS_MULTICAST(eh->ether_dhost)) + dst_if = NULL; + else + dst_if = bridge_rtlookup(sc, eh->ether_dhost); + if (dst_if == NULL) { + struct bridge_iflist *bif; + struct mbuf *mc; + int used = 0; + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + dst_if = bif->bif_ifp; + if ((ifnet_flags(dst_if) & IFF_RUNNING) == 0) + continue; + + /* + * If this is not the original output interface, + * and the interface is participating in spanning + * tree, make sure the port is in a state that + * allows forwarding. + */ + if (dst_if != ifp && + (bif->bif_flags & IFBIF_STP) != 0) { + switch (bif->bif_state) { + case BSTP_IFSTATE_BLOCKING: + case BSTP_IFSTATE_LISTENING: + case BSTP_IFSTATE_DISABLED: + continue; + } + } + + if (LIST_NEXT(bif, bif_next) == NULL) { + used = 1; + mc = m; + } else { + mc = m_copym(m, 0, M_COPYALL, M_NOWAIT); + if (mc == NULL) { + printf("bridge_output ifp %p m_copym failed\n", ifp); + (void) ifnet_stat_increment_out(sc->sc_if, 0, 0, 1); + continue; + } + } + + bridge_enqueue(sc, dst_if, mc); + } + if (used == 0) { + printf("bridge_output ifp %p not used\n", ifp); + m_freem(m); + } + lck_mtx_unlock(sc->sc_mtx); + + return EJUSTRETURN; + } + +sendunicast: + /* + * XXX Spanning tree consideration here? 
+ */ + + if ((ifnet_flags(dst_if) & IFF_RUNNING) == 0) { + printf("bridge_output ifp %p dst_if %p not running\n", ifp, dst_if); + m_freem(m); + + return EJUSTRETURN; + } + + if (dst_if != ifp) { + lck_mtx_lock(sc->sc_mtx); + + bridge_enqueue(sc, dst_if, m); + + lck_mtx_unlock(sc->sc_mtx); + + return EJUSTRETURN; + } + + return (0); +} +#endif /* BRIDGE_MEMBER_OUT_FILTER */ + +#if APPLE_BRIDGE_HWCKSUM_SUPPORT +static struct mbuf* bridge_fix_txcsum( struct mbuf *m ) +{ + // basic tests indicate that the vast majority of packets being processed + // here have an Ethernet header mbuf pre-pended to them (the first case below) + // the second highest are those where the Ethernet and IP/TCP/UDP headers are + // all in one mbuf (second case below) + // the third case has, in fact, never hit for me -- although if I comment out + // the first two cases, that code works for them, so I consider it a + // decent general solution + + int amt = ETHER_HDR_LEN; + int hlen = M_CSUM_DATA_IPv4_IPHL( m->m_pkthdr.csum_data ); + int off = M_CSUM_DATA_IPv4_OFFSET( m->m_pkthdr.csum_data ); + + /* + * NOTE we should never get vlan-attached packets here; + * support for those COULD be added, but we don't use them + * and it really kinda slows things down to worry about them + */ + +#ifdef DIAGNOSTIC + if ( m_tag_find( m, PACKET_TAG_VLAN, NULL ) != NULL ) + { + printf( "bridge: transmitting packet tagged with VLAN?\n" ); + KASSERT( 0 ); + m_freem( m ); + return NULL; + } +#endif + + if ( m->m_pkthdr.csum_flags & M_CSUM_IPv4 ) + { + amt += hlen; + } + if ( m->m_pkthdr.csum_flags & M_CSUM_TCPv4 ) + { + amt += off + sizeof( uint16_t ); + } + + if ( m->m_pkthdr.csum_flags & M_CSUM_UDPv4 ) + { + amt += off + sizeof( uint16_t ); + } + + if ( m->m_len == ETHER_HDR_LEN ) + { + // this is the case where there's an Ethernet header in an mbuf + + // the first mbuf is the Ethernet header -- just strip it off and do the checksum + struct mbuf *m_ip = m->m_next; + + // set up m_ip so the cksum operations 
work + /* APPLE MODIFICATION 22 Apr 2008 + * Clear the m_tag list before setting + * M_PKTHDR. + * + * If this m_buf chain was extended via M_PREPEND(), then + * m_ip->m_pkthdr is identical to m->m_pkthdr (see + * M_MOVE_PKTHDR()). The only thing preventing access to this + * invalid packet header data is the fact that the M_PKTHDR + * flag is clear, i.e., m_ip->m_flag & M_PKTHDR == 0, but we're + * about to set the M_PKTHDR flag, so to be safe we initialize, + * more accurately, we clear, m_ip->m_pkthdr.tags via + * m_tag_init(). + * + * Suppose that we do not do this; if m_pullup(), below, fails, + * then m_ip will be freed along with m_ip->m_pkthdr.tags, but + * we will also free m soon after, via m_freem(), and + * consequently attempt to free m->m_pkthdr.tags in the + * process. The problem is that m->m_pkthdr.tags will have + * already been freed by virtue of being equal to + * m_ip->m_pkthdr.tags. Attempts to dereference + * m->m_pkthdr.tags in m_tag_delete_chain() will result in a + * panic. 
+ */ + m_tag_init(m_ip); + /* END MODIFICATION */ + m_ip->m_flags |= M_PKTHDR; + m_ip->m_pkthdr.csum_flags = m->m_pkthdr.csum_flags; + m_ip->m_pkthdr.csum_data = m->m_pkthdr.csum_data; + m_ip->m_pkthdr.len = m->m_pkthdr.len - ETHER_HDR_LEN; + + // set up the header mbuf so we can prepend it back on again later + m->m_pkthdr.csum_flags = 0; + m->m_pkthdr.csum_data = 0; + m->m_pkthdr.len = ETHER_HDR_LEN; + m->m_next = NULL; + + + // now do the checksums we need -- first IP + if ( m_ip->m_pkthdr.csum_flags & M_CSUM_IPv4 ) + { + // make sure the IP header (or at least the part with the cksum) is there + m_ip = m_pullup( m_ip, sizeof( struct ip ) ); + if ( m_ip == NULL ) + { + printf( "bridge: failed to flatten header\n "); + m_freem( m ); + return NULL; + } + + // now do the checksum + { + struct ip *ip = mtod( m_ip, struct ip* ); + ip->ip_sum = in_cksum( m_ip, hlen ); + +#ifdef VERY_VERY_VERY_DIAGNOSTIC + printf( "bridge: performed IPv4 checksum\n" ); +#endif + } + } + + // now do a TCP or UDP delayed checksum + if ( m_ip->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4) ) + { + in_delayed_cksum( m_ip ); + +#ifdef VERY_VERY_VERY_DIAGNOSTIC + printf( "bridge: performed TCPv4/UDPv4 checksum\n" ); +#endif + } + + // now attach the ethernet header back onto the IP packet + m->m_next = m_ip; + m->m_pkthdr.len += m_length( m_ip ); + + // clear the M_PKTHDR flags on the ip packet (again, we re-attach later) + m_ip->m_flags &= ~M_PKTHDR; + + // and clear any csum flags + m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4); + } + else if ( m->m_len >= amt ) + { + // everything fits in the first mbuf, so futz with m->m_data, m->m_len and m->m_pkthdr.len to + // make it work + m->m_len -= ETHER_HDR_LEN; + m->m_data += ETHER_HDR_LEN; + m->m_pkthdr.len -= ETHER_HDR_LEN; + + // now do the checksums we need -- first IP + if ( m->m_pkthdr.csum_flags & M_CSUM_IPv4 ) + { + struct ip *ip = mtod( m, struct ip* ); + ip->ip_sum = in_cksum( m, hlen ); + +#ifdef 
VERY_VERY_VERY_DIAGNOSTIC + printf( "bridge: performed IPv4 checksum\n" ); +#endif + } + + // now do a TCP or UDP delayed checksum + if ( m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4) ) + { + in_delayed_cksum( m ); + +#ifdef VERY_VERY_VERY_DIAGNOSTIC + printf( "bridge: performed TCPv4/UDPv4 checksum\n" ); +#endif + } + + // now stick the ethernet header back on + m->m_len += ETHER_HDR_LEN; + m->m_data -= ETHER_HDR_LEN; + m->m_pkthdr.len += ETHER_HDR_LEN; + + // and clear any csum flags + m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4); + } + else + { + struct mbuf *m_ip; + + // general case -- need to simply split it off and deal + + // first, calculate how much needs to be made writable (we may have a read-only mbuf here) + hlen = M_CSUM_DATA_IPv4_IPHL( m->m_pkthdr.csum_data ); +#if PARANOID + off = M_CSUM_DATA_IPv4_OFFSET( m->m_pkthdr.csum_data ); + + if ( m->m_pkthdr.csum_flags & M_CSUM_IPv4 ) + { + amt += hlen; + } + + if ( m->m_pkthdr.csum_flags & M_CSUM_TCPv4 ) + { + amt += sizeof( struct tcphdr * ); + amt += off; + } + + if ( m->m_pkthdr.csum_flags & M_CSUM_UDPv4 ) + { + amt += sizeof( struct udphdr * ); + amt += off; + } +#endif + + // now split the ethernet header off of the IP packet (we'll re-attach later) + m_ip = m_split( m, ETHER_HDR_LEN, M_NOWAIT ); + if ( m_ip == NULL ) + { + printf( "bridge_fix_txcsum: could not split ether header\n" ); + + m_freem( m ); + return NULL; + } + +#if PARANOID + // make sure that the IP packet is writable for the portion we need + if ( m_makewritable( &m_ip, 0, amt, M_DONTWAIT ) != 0 ) + { + printf( "bridge_fix_txcsum: could not make %d bytes writable\n", amt ); + + m_freem( m ); + m_freem( m_ip ); + return NULL; + } +#endif + + m_ip->m_pkthdr.csum_flags = m->m_pkthdr.csum_flags; + m_ip->m_pkthdr.csum_data = m->m_pkthdr.csum_data; + + m->m_pkthdr.csum_flags = 0; + m->m_pkthdr.csum_data = 0; + + // now do the checksums we need -- first IP + if ( m_ip->m_pkthdr.csum_flags & M_CSUM_IPv4 ) + { + // 
make sure the IP header (or at least the part with the cksum) is there + m_ip = m_pullup( m_ip, sizeof( struct ip ) ); + if ( m_ip == NULL ) + { + printf( "bridge: failed to flatten header\n "); + m_freem( m ); + return NULL; + } + + // now do the checksum + { + struct ip *ip = mtod( m_ip, struct ip* ); + ip->ip_sum = in_cksum( m_ip, hlen ); + +#ifdef VERY_VERY_VERY_DIAGNOSTIC + printf( "bridge: performed IPv4 checksum\n" ); +#endif + } + } + + // now do a TCP or UDP delayed checksum + if ( m_ip->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4) ) + { + in_delayed_cksum( m_ip ); + +#ifdef VERY_VERY_VERY_DIAGNOSTIC + printf( "bridge: performed TCPv4/UDPv4 checksum\n" ); +#endif + } + + // now attach the ethernet header back onto the IP packet + m->m_next = m_ip; + m->m_pkthdr.len += m_length( m_ip ); + + // clear the M_PKTHDR flags on the ip packet (again, we re-attach later) + m_ip->m_flags &= ~M_PKTHDR; + + // and clear any csum flags + m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4); + } + + return m; +} +#endif + +/* + * bridge_start: + * + * Start output on a bridge. 
+ */ +static errno_t +bridge_start(ifnet_t ifp, mbuf_t m) +{ + struct bridge_softc *sc = ifnet_softc(ifp); + struct ether_header *eh; + struct ifnet *dst_if; + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + + eh = mtod(m, struct ether_header *); + + if ((m->m_flags & (M_BCAST|M_MCAST)) == 0 && + (dst_if = bridge_rtlookup(sc, eh->ether_dhost)) != NULL) { + + { +#if APPLE_BRIDGE_HWCKSUM_SUPPORT + /* + * APPLE MODIFICATION - if the packet needs a checksum (i.e., + * checksum has been deferred for HW support) AND the destination + * interface doesn't support HW checksums, then we + * need to fix-up the checksum here + */ + if ( + ( (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4) ) != 0 ) && + ( (dst_if->if_csum_flags_tx & m->m_pkthdr.csum_flags ) != m->m_pkthdr.csum_flags ) + ) + { + m = bridge_fix_txcsum( m ); + if ( m == NULL ) + { + goto done; + } + } + +#else + if (eh->ether_type == htons(ETHERTYPE_IP)) + mbuf_outbound_finalize(m, PF_INET, sizeof(struct ether_header)); + else + m->m_pkthdr.csum_flags = 0; +#endif + lck_mtx_lock(sc->sc_mtx); + #if NBPFILTER > 0 + if (sc->sc_bpf_output) + bridge_bpf_output(ifp, m); + #endif + bridge_enqueue(sc, dst_if, m); + lck_mtx_unlock(sc->sc_mtx); + } + } else + { +#if APPLE_BRIDGE_HWCKSUM_SUPPORT + + /* + * APPLE MODIFICATION - if the MULTICAST packet needs a checksum (i.e., + * checksum has been deferred for HW support) AND at least one destination + * interface doesn't support HW checksums, then we go ahead and fix it up + * here, since it doesn't make sense to do it more than once + */ + + if ( + (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4)) && + /* + * XXX FIX ME: keep track of whether or not we have any interfaces that + * do not support checksums (for now, assume we do) + */ + ( 1 ) + ) + { + m = bridge_fix_txcsum( m ); + if ( m == NULL ) + { + goto done; + } + } +#else + if (eh->ether_type == htons(ETHERTYPE_IP)) + mbuf_outbound_finalize(m, PF_INET, sizeof(struct 
ether_header)); + else + m->m_pkthdr.csum_flags = 0; +#endif + + lck_mtx_lock(sc->sc_mtx); + #if NBPFILTER > 0 + if (sc->sc_bpf_output) + bridge_bpf_output(ifp, m); + #endif + bridge_broadcast(sc, ifp, m, 0); + lck_mtx_unlock(sc->sc_mtx); + } +#if APPLE_BRIDGE_HWCKSUM_SUPPORT +done: +#endif + + return 0; +} + +/* + * bridge_forward: + * + * The forwarding function of the bridge. + */ +static void +bridge_forward(struct bridge_softc *sc, struct mbuf *m) +{ + struct bridge_iflist *bif; + struct ifnet *src_if, *dst_if; + struct ether_header *eh; + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf("bridge_forward %s%d m%p\n", ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if), m); +#endif /* BRIDGE_DEBUG */ + + src_if = m->m_pkthdr.rcvif; + + (void) ifnet_stat_increment_in(sc->sc_if, 1, m->m_pkthdr.len, 0); + + /* + * Look up the bridge_iflist. + */ + bif = bridge_lookup_member_if(sc, src_if); + if (bif == NULL) { + /* Interface is not a bridge member (anymore?) */ + m_freem(m); + return; + } + + /* APPLE MODIFICATION - add the ability to block forwarding of packets; for the guest network */ +#if ( APPLE_HAVE_80211_GUEST_NETWORK ) + if (bif->bif_flags & IFBIF_NO_FORWARDING) { + /* Drop the packet and we're done. */ + m_freem(m); + return; + } +#endif + + if (bif->bif_flags & IFBIF_STP) { + switch (bif->bif_state) { + case BSTP_IFSTATE_BLOCKING: + case BSTP_IFSTATE_LISTENING: + case BSTP_IFSTATE_DISABLED: + m_freem(m); + return; + } + } + + eh = mtod(m, struct ether_header *); + + /* + * If the interface is learning, and the source + * address is valid and not multicast, record + * the address. 
+ */ + if ((bif->bif_flags & IFBIF_LEARNING) != 0 && + ETHER_IS_MULTICAST(eh->ether_shost) == 0 && + (eh->ether_shost[0] | eh->ether_shost[1] | + eh->ether_shost[2] | eh->ether_shost[3] | + eh->ether_shost[4] | eh->ether_shost[5]) != 0) { + (void) bridge_rtupdate(sc, eh->ether_shost, + src_if, 0, IFBAF_DYNAMIC); + } + + if ((bif->bif_flags & IFBIF_STP) != 0 && + bif->bif_state == BSTP_IFSTATE_LEARNING) { + m_freem(m); + return; + } + + /* + * At this point, the port either doesn't participate + * in spanning tree or it is in the forwarding state. + */ + + /* + * If the packet is unicast, destined for someone on + * "this" side of the bridge, drop it. + */ + if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) { + /* APPLE MODIFICATION - if the packet came in on a proxy sta discovery interface, + we need to not look up the node by DA of the packet; we need to look up the proxy sta which + matches the SA. If it's not found yet, drop the packet. */ +#if IEEE80211_PROXYSTA + if (bif->bif_flags & IFBIF_PROXYSTA_DISCOVER) + { + struct bridge_rtnode *brt; + dst_if = NULL; + brt = bridge_rtnode_lookup(sc, eh->ether_shost); + if (brt) { + dst_if = brt->brt_ifp_proxysta; + } + if (dst_if == NULL) { + m_freem(m); + return; + } + } + else +#endif + dst_if = bridge_rtlookup(sc, eh->ether_dhost); + if (src_if == dst_if) { + m_freem(m); + return; + } + } else { + /* ...forward it to all interfaces. */ + sc->sc_if->if_imcasts++; + dst_if = NULL; + } + + /* APPLE MODIFICATION + - this is now handled by bridge_input + - turning this back on because all packets are not bpf_mtap'd + equally. 
RSN Preauth were not getting through; we're + conditionalizing this call on + (eh->ether_type == htons(ETHERTYPE_RSN_PREAUTH)) + */ +#if 1 + if (eh->ether_type == htons(ETHERTYPE_RSN_PREAUTH)) + { + m->m_pkthdr.rcvif = sc->sc_if; +#if NBPFILTER > 0 + if (sc->sc_bpf_input) + bridge_bpf_input(sc->sc_if, m); +#endif + } +#endif + + if (dst_if == NULL) { + +#if APPLE_BRIDGE_HWCKSUM_SUPPORT + /* + * Clear any in-bound checksum flags for this packet. + */ + m->m_pkthdr.csum_flags = 0; +#else + mbuf_inbound_modified(m); +#endif + + bridge_broadcast(sc, src_if, m, 1); + return; + } + + /* + * At this point, we're dealing with a unicast frame + * going to a different interface. + */ + if ((ifnet_flags(dst_if) & IFF_RUNNING) == 0) { + m_freem(m); + return; + } + bif = bridge_lookup_member_if(sc, dst_if); + if (bif == NULL) { + /* Not a member of the bridge (anymore?) */ + m_freem(m); + return; + } + + if (bif->bif_flags & IFBIF_STP) { + switch (bif->bif_state) { + case BSTP_IFSTATE_DISABLED: + case BSTP_IFSTATE_BLOCKING: + m_freem(m); + return; + } + } + +#if APPLE_BRIDGE_HWCKSUM_SUPPORT + /* + * Clear any in-bound checksum flags for this packet. + */ + { + m->m_pkthdr.csum_flags = 0; + } +#else + mbuf_inbound_modified(m); +#endif + + bridge_enqueue(sc, dst_if, m); +} + +char * ether_ntop(char *, size_t , const u_char *); + +__private_extern__ char * +ether_ntop(char *buf, size_t len, const u_char *ap) +{ + snprintf(buf, len, "%02x:%02x:%02x:%02x:%02x:%02x", + ap[0], ap[1], ap[2], ap[3], ap[4], ap[5]); + + return buf; +} + +/* + * bridge_input: + * + * Receive input from a member interface. Queue the packet for + * bridging if it is not for us. 
+ */ +errno_t +bridge_input(struct bridge_iflist *bif, struct ifnet *ifp, struct mbuf *m, void *frame_header) +{ + struct ifnet *bifp; + struct ether_header *eh; + struct mbuf *mc; + int is_for_us = 0; + struct bridge_softc *sc = bif->bif_sc; + struct bridge_iflist *brm; + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf("bridge_input: %s%d from %s%d m %p data %p\n", + ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if), + ifnet_name(ifp), ifnet_unit(ifp), + m, mbuf_data(m)); +#endif /* BRIDGE_DEBUG */ + + if ((ifnet_flags(sc->sc_if) & IFF_RUNNING) == 0) { +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf( "bridge_input: %s%d not running passing along\n", + ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if)); +#endif /* BRIDGE_DEBUG */ + return 0; + } + + /* Need to clear the promiscous flags otherwise it will be dropped by DLIL after processing filters */ + if ((mbuf_flags(m) & MBUF_PROMISC)) + mbuf_setflags_mask(m, 0, MBUF_PROMISC); + + lck_mtx_lock(sc->sc_mtx); + + bifp = sc->sc_if; + + /* Is it a good idea to reassign a new value to bif ? TBD */ + bif = bridge_lookup_member_if(sc, ifp); + if (bif == NULL) { + lck_mtx_unlock(sc->sc_mtx); +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf( "bridge_input: %s%d bridge_lookup_member_if failed\n", + ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if)); +#endif /* BRIDGE_DEBUG */ + return 0; + } + + eh = (struct ether_header *)mbuf_data(m); + + /* + * If the packet is for us, set the packets source as the + * bridge, and return the packet back to ether_input for + * local processing. + */ + if (memcmp(eh->ether_dhost, ifnet_lladdr(bifp), + ETHER_ADDR_LEN) == 0) { + + /* Mark the packet as arriving on the bridge interface */ + (void) mbuf_pkthdr_setrcvif(m, bifp); + mbuf_pkthdr_setheader(m, frame_header); + + /* + * If the interface is learning, and the source + * address is valid and not multicast, record + * the address. 
+ */ + if ((bif->bif_flags & IFBIF_LEARNING) != 0 && + ETHER_IS_MULTICAST(eh->ether_shost) == 0 && + (eh->ether_shost[0] | eh->ether_shost[1] | + eh->ether_shost[2] | eh->ether_shost[3] | + eh->ether_shost[4] | eh->ether_shost[5]) != 0) { + (void) bridge_rtupdate(sc, eh->ether_shost, + ifp, 0, IFBAF_DYNAMIC); + } + +#if NBPFILTER > 0 + if (sc->sc_bpf_input) + bridge_bpf_input(bifp, m); +#endif + + (void) mbuf_setdata(m, (char *)mbuf_data(m) + ETHER_HDR_LEN, mbuf_len(m) - ETHER_HDR_LEN); + (void) mbuf_pkthdr_adjustlen(m, - ETHER_HDR_LEN); + + (void) ifnet_stat_increment_in(bifp, 1, mbuf_pkthdr_len(m), 0); + + lck_mtx_unlock(sc->sc_mtx); + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf( "bridge_input: %s%d packet for bridge\n", + ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if)); +#endif /* BRIDGE_DEBUG */ + + dlil_input_packet_list(bifp, m); + + return EJUSTRETURN; + } + + /* + * if the destination of the packet is for the MAC address of + * the member interface itself, then we don't need to forward + * it -- just pass it back. 
Note that it'll likely just be + * dropped by the stack, but if something else is bound to + * the interface directly (for example, the wireless stats + * protocol -- although that actually uses BPF right now), + * then it will consume the packet + * + * ALSO, note that we do this check AFTER checking for the + * bridge's own MAC address, because the bridge may be + * using the SAME MAC address as one of its interfaces + */ + if (memcmp(eh->ether_dhost, ifnet_lladdr(ifp), + ETHER_ADDR_LEN) == 0) { + /* APPLE MODIFICATION - add support for Proxy STA */ +#if IEEE80211_PROXYSTA + if ((bif->bif_flags & IFBIF_PROXYSTA) == 0) { +#endif + +#ifdef VERY_VERY_VERY_DIAGNOSTIC + printf("bridge_input: not forwarding packet bound for member interface\n" ); +#endif + lck_mtx_unlock(sc->sc_mtx); + return 0; + /* APPLE MODIFICATION - add support for Proxy STA */ +#if IEEE80211_PROXYSTA + } +#if VERY_VERY_VERY_DIAGNOSTIC + else { + printf( "%s: pkt rx on %s [proxysta iface], da is %02x:%02x:%02x:%02x:%02x:%02x\n", + __func__, ifp->if_xname, eh->ether_dhost[0], eh->ether_dhost[1], eh->ether_dhost[2], + eh->ether_dhost[3], eh->ether_dhost[4], eh->ether_dhost[5] ); + } +#endif +#endif + } + + if ((m->m_flags & (M_BCAST|M_MCAST))) { + struct ifmultiaddr *ifma = NULL; + + if ((m->m_flags & M_BCAST)) { + is_for_us = 1; + } else { +#if BRIDGE_DEBUG + printf("mulicast: %02x:%02x:%02x:%02x:%02x:%02x\n", + eh->ether_dhost[0], eh->ether_dhost[1], eh->ether_dhost[2], + eh->ether_dhost[3], eh->ether_dhost[4], eh->ether_dhost[5]); + + for (ifma = bifp->if_multiaddrs.lh_first; ifma; + ifma = ifma->ifma_link.le_next) { + + if (ifma->ifma_addr == NULL) + printf(" "); + else if (ifma->ifma_addr->sa_family == AF_INET) { + struct sockaddr_in *sin = (struct sockaddr_in *)ifma->ifma_addr; + + printf(" %u.%u.%u.%u ", + (sin->sin_addr.s_addr & 0xff000000) >> 24, + (sin->sin_addr.s_addr & 0x00ff0000) >> 16, + (sin->sin_addr.s_addr & 0x0000ff00) >> 8, + (sin->sin_addr.s_addr & 0x000000ff)); + } + if 
(!ifma->ifma_ll || !ifma->ifma_ll->ifma_addr) + printf("\n"); + else { + struct sockaddr_dl *sdl = (struct sockaddr_dl *)ifma->ifma_ll->ifma_addr; + + printf("%02x:%02x:%02x:%02x:%02x:%02x\n", + CONST_LLADDR(sdl)[0], CONST_LLADDR(sdl)[1], CONST_LLADDR(sdl)[2], + CONST_LLADDR(sdl)[3], CONST_LLADDR(sdl)[4], CONST_LLADDR(sdl)[5]); + + } + } +#endif /* BRIDGE_DEBUG */ + + /* + * the upper layer of the stack have attached a list of multicast addresses to the bridge itself + * (for example, the IP stack has bound 01:00:5e:00:00:01 to the 224.0.0.1 all hosts address), since + * the IP stack is bound to the bridge. so we need to see if the packets arriving here SHOULD be + * passed up as coming from the bridge. + * + * furthermore, since we know the IP stack is attached to the bridge, and NOTHING is attached + * to the underlying devices themselves, we can drop packets that don't need to go up (by returning NULL + * from bridge_input to the caller) after we forward the packet to other interfaces + */ + + for (ifma = bifp->if_multiaddrs.lh_first; ifma; + ifma = ifma->ifma_link.le_next) { + if (ifma->ifma_ll && ifma->ifma_ll->ifma_addr) { + struct sockaddr_dl *sdl = (struct sockaddr_dl *)ifma->ifma_ll->ifma_addr; + + if (memcmp(eh->ether_dhost, CONST_LLADDR(sdl), ETHER_ADDR_LEN) == 0) + break; + } + } + if (ifma != NULL) { + /* this packet matches the bridge's own filter, so pass it up as coming from us */ + + /* Mark the packet as arriving on the bridge interface */ + // don't do this until AFTER we forward the packet -- bridge_forward uses this information + //m->m_pkthdr.rcvif = bifp; + + /* keep track of this to help us decide about forwarding */ + is_for_us = 1; + +#if BRIDGE_DEBUG + char addr[sizeof("XX:XX:XX:XX:XX:XX")+1]; + printf( "bridge_input: multicast frame for us (%s)\n", + ether_ntop(addr, sizeof(addr), eh->ether_dhost) ); +#endif + } else { +#if BRIDGE_DEBUG + char addr[sizeof("XX:XX:XX:XX:XX:XX")+1]; + printf( "bridge_input: multicast frame for unbound 
address (%s), forwarding but not passing to stack\n", + ether_ntop(addr, sizeof(addr), eh->ether_dhost) ); +#endif + } + } + /* Tap off 802.1D packets; they do not get forwarded. */ + if (memcmp(eh->ether_dhost, bstp_etheraddr, + ETHER_ADDR_LEN) == 0) { + m = bstp_input(sc, ifp, m); + if (m == NULL) { + lck_mtx_unlock(sc->sc_mtx); +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf( "bridge_input: %s%d mcast BSTP not forwarded\n", + ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if)); +#endif /* BRIDGE_DEBUG */ + return EJUSTRETURN; + } + } + + if (bif->bif_flags & IFBIF_STP) { + switch (bif->bif_state) { + case BSTP_IFSTATE_BLOCKING: + case BSTP_IFSTATE_LISTENING: + case BSTP_IFSTATE_DISABLED: + { + lck_mtx_unlock(sc->sc_mtx); + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf( "bridge_input: %s%d mcast bridge not learning or forwarding \n", + ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if)); +#endif /* BRIDGE_DEBUG */ + + m_freem(m); + return EJUSTRETURN; + } + } + } + + /* + * If the interface is learning, and the source + * address is valid and not multicast, record + * the address. + */ + if ((bif->bif_flags & IFBIF_LEARNING) != 0 && + ETHER_IS_MULTICAST(eh->ether_shost) == 0 && + (eh->ether_shost[0] | eh->ether_shost[1] | + eh->ether_shost[2] | eh->ether_shost[3] | + eh->ether_shost[4] | eh->ether_shost[5]) != 0) { + (void) bridge_rtupdate(sc, eh->ether_shost, + ifp, 0, IFBAF_DYNAMIC); + } + + if (is_for_us) { + /* + * Make a deep copy of the packet and enqueue the copy + * for bridge processing; return the original packet for + * local processing. 
+ */ + mc = m_dup(m, M_NOWAIT); + if (mc == NULL) { +#ifdef DIAGNOSTIC + printf( "bridge_input: failed to duplicate multicast frame, not forwarding\n" ); +#endif +#if BRIDGE_DEBUG + } else { + if (_if_brige_debug) { + printf_mbuf(mc, "mc for us: ", "\n"); + printf_mbuf_data(m, 0, 20); + printf("\n"); + } +#endif /* BRIDGE_DEBUG */ + } + } else { + /* + * we'll just pass the original, since we don't need to pass it + * up the stack + */ + mc = m; + } + + /* Perform the bridge forwarding function with the copy. */ + if (mc != NULL) { +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf( "bridge_input: %s%d mcast forwarding \n", + ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if)); +#endif /* BRIDGE_DEBUG */ + bridge_forward(sc, mc); + } + + // TBD should have an option for type of bridge +#if 0 + /* + * Reinject the mbuf as arriving on the bridge so we have a + * chance at claiming multicast packets. We can not loop back + * here from ether_input as a bridge is never a member of a + * bridge. + */ + if (bifp->if_bridge != NULL) + panic("brige_input: brige %p in a bridge %p\n", bifp, bifp->if_bridge); + mc = m_dup(m, M_NOWAIT); + if (mc != NULL) { + mc->m_pkthdr.rcvif = bifp; +#if NBPFILTER > 0 + if (sc->sc_bpf_input) + bridge_bpf_input(bifp, mc); +#endif + } +#endif + /* Return the original packet for local processing. 
*/ + if ( !is_for_us ) + { + /* we don't free the packet -- bridge_forward already did so */ + lck_mtx_unlock(sc->sc_mtx); + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf( "bridge_input: %s%d mcast local processing\n", + ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if)); +#endif + + return EJUSTRETURN; + } + + // mark packet as arriving on the bridge + m->m_pkthdr.rcvif = bifp; + m->m_pkthdr.header = mbuf_data(m); + +#if NBPFILTER > 0 + if (sc->sc_bpf_input) + bridge_bpf_input(bifp, m); +#endif + (void) mbuf_setdata(m, (char *)mbuf_data(m) + ETHER_HDR_LEN, mbuf_len(m) - ETHER_HDR_LEN); + (void) mbuf_pkthdr_adjustlen(m, - ETHER_HDR_LEN); + + (void) ifnet_stat_increment_in(bifp, 1, mbuf_pkthdr_len(m), 0); + + lck_mtx_unlock(sc->sc_mtx); + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf( "bridge_input: %s%d mcast for us\n", + ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if)); +#endif /* BRIDGE_DEBUG */ + + dlil_input_packet_list(bifp, m); + + return EJUSTRETURN; + } + + if (bif->bif_flags & IFBIF_STP) { + switch (bif->bif_state) { + case BSTP_IFSTATE_BLOCKING: + case BSTP_IFSTATE_LISTENING: + case BSTP_IFSTATE_DISABLED: + lck_mtx_unlock(sc->sc_mtx); + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf( "bridge_input: %s%d ucast bridge not learning or forwarding \n", + ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if)); +#endif /* BRIDGE_DEBUG */ + + m_freem(m); + return EJUSTRETURN; + } + } + + /* this code is not needed for Apple's bridge where the stack attaches directly */ +#if 1 /* TBD should be an option */ + /* + * Unicast. Make sure it's not for us. + */ + LIST_FOREACH(brm, &sc->sc_iflist, bif_next) { + if(ifnet_type(brm->bif_ifp) != IFT_ETHER) + continue; + + /* It is destined for us. 
*/ + if (memcmp(ifnet_lladdr(brm->bif_ifp), eh->ether_dhost, + ETHER_ADDR_LEN) == 0) { + if (brm->bif_flags & IFBIF_LEARNING) + (void) bridge_rtupdate(sc, + eh->ether_shost, ifp, 0, IFBAF_DYNAMIC); + m->m_pkthdr.rcvif = brm->bif_ifp; + m->m_pkthdr.header = mbuf_data(m); + + (void) mbuf_setdata(m, (char *)mbuf_data(m) + ETHER_HDR_LEN, mbuf_len(m) - ETHER_HDR_LEN); + (void) mbuf_pkthdr_adjustlen(m, - ETHER_HDR_LEN); +#if BRIDGE_SUPPORT_GIF +#if NGIF > 0 + if (ifnet_type(ifp) == IFT_GIF) { + m->m_flags |= M_PROTO1; + m->m_pkthdr.rcvif = brm->bif_ifp; + (*brm->bif_ifp->if_input)(brm->bif_ifp, m); + m = NULL; + } +#endif +#endif + lck_mtx_unlock(sc->sc_mtx); + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf( "bridge_input: %s%d ucast to member %s%d\n", + ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if), + ifnet_name(brm->bif_ifp), ifnet_unit(brm->bif_ifp)); +#endif /* BRIDGE_DEBUG */ + + dlil_input_packet_list(brm->bif_ifp, m); + + return EJUSTRETURN; + } + + /* We just received a packet that we sent out. */ + if (memcmp(ifnet_lladdr(brm->bif_ifp), eh->ether_shost, + ETHER_ADDR_LEN) == 0) { + lck_mtx_unlock(sc->sc_mtx); + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf( "bridge_input: %s%d ucast drop packet we sent out\n", + ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if)); +#endif /* BRIDGE_DEBUG */ + + m_freem(m); + return EJUSTRETURN; + } + } +#endif + + /* + * If the interface is learning, and the source + * address is valid and not multicast, record + * the address. + */ + if ((bif->bif_flags & IFBIF_LEARNING) != 0 && + ETHER_IS_MULTICAST(eh->ether_shost) == 0 && + (eh->ether_shost[0] | eh->ether_shost[1] | + eh->ether_shost[2] | eh->ether_shost[3] | + eh->ether_shost[4] | eh->ether_shost[5]) != 0) { + (void) bridge_rtupdate(sc, eh->ether_shost, + ifp, 0, IFBAF_DYNAMIC); + } + + /* Perform the bridge forwarding function. 
*/ +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf( "bridge_input: %s%d ucast forwarding\n", + ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if)); +#endif /* BRIDGE_DEBUG */ + + bridge_forward(sc, m); + lck_mtx_unlock(sc->sc_mtx); + return EJUSTRETURN; +} + +/* + * bridge_broadcast: + * + * Send a frame to all interfaces that are members of + * the bridge, except for the one on which the packet + * arrived. + */ +static void +bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, + struct mbuf *m, __unused int runfilt) +{ + struct bridge_iflist *bif; + struct mbuf *mc; + struct ifnet *dst_if; + int used = 0; + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + dst_if = bif->bif_ifp; + if (dst_if == src_if) + continue; + + if (bif->bif_flags & IFBIF_STP) { + switch (bif->bif_state) { + case BSTP_IFSTATE_BLOCKING: + case BSTP_IFSTATE_DISABLED: + continue; + } + } + + if ((bif->bif_flags & IFBIF_DISCOVER) == 0 && + (m->m_flags & (M_BCAST|M_MCAST)) == 0) + continue; + + if ((ifnet_flags(dst_if) & IFF_RUNNING) == 0) + continue; + + if (LIST_NEXT(bif, bif_next) == NULL) { + mc = m; + used = 1; + } else { + mc = m_copym(m, 0, M_COPYALL, M_DONTWAIT); + if (mc == NULL) { + (void) ifnet_stat_increment_out(sc->sc_if, 0, 0, 1); + continue; + } + } + + bridge_enqueue(sc, dst_if, mc); + } + if (used == 0) + m_freem(m); +} + +/* + * bridge_rtupdate: + * + * Add a bridge routing entry. + */ +static int +bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, + struct ifnet *dst_if, int setflags, uint8_t flags) +{ + struct bridge_rtnode *brt; + int error; + /* APPLE MODIFICATION - add support for Proxy STA */ +#if IEEE80211_PROXYSTA + struct bridge_iflist *bif; + int is_pds; /* are we a proxy sta discovery interface? */ +#endif + struct timespec now; + + /* APPLE MODIFICATION - add support for Proxy STA - is this an interface + we want to do proxy sta discovery on? 
*/ +#if IEEE80211_PROXYSTA + bif = bridge_lookup_member_if(sc, dst_if); + if ((bif) && (bif->bif_flags & IFBIF_PROXYSTA_DISCOVER)) { + is_pds = 1; + } + else { + is_pds = 0; + } +#endif + /* + * A route for this destination might already exist. If so, + * update it, otherwise create a new one. + */ + if ((brt = bridge_rtnode_lookup(sc, dst)) == NULL) { + /* APPLE MODIFICATION - add support for Proxy STA */ +#if IEEE80211_PROXYSTA + /* don't count this address against the bridge cache (well, allow proxy stas to double that + number...put *some* boundary on it.) if we are a proxy sta discovery interface */ + if (is_pds) { + if (sc->sc_brtcnt >= (sc->sc_brtmax+sc->sc_brtmax_proxysta)) + return (ENOSPC); + } + else +#endif + if (sc->sc_brtcnt >= sc->sc_brtmax) + return (ENOSPC); + + /* + * Allocate a new bridge forwarding node, and + * initialize the expiration time and Ethernet + * address. + */ + brt = zalloc_noblock(bridge_rtnode_pool); + if (brt == NULL) + return (ENOMEM); + + memset(brt, 0, sizeof(*brt)); + nanouptime(&now); + brt->brt_expire = now.tv_sec + sc->sc_brttimeout; + brt->brt_flags = IFBAF_DYNAMIC; + memcpy(brt->brt_addr, dst, ETHER_ADDR_LEN); + + /* APPLE MODIFICATION - add support for Proxy STA - is this an interface + we want to do proxy sta discovery on? If so, post a monitoring event */ +#if IEEE80211_PROXYSTA + if (is_pds) { + brt->brt_flags_ext |= IFBAF_EXT_PROXYSTA; +#if DIAGNOSTIC + printf( "%s: proxysta %02x:%02x:%02x:%02x:%02x:%02x on %s; discovery\n", + __func__, dst[0], dst[1], dst[2], dst[3], dst[4], dst[5], dst_if->if_xname ); +#endif + bridge_proxysta_discover( dst_if, dst ); + } +#endif + + if ((error = bridge_rtnode_insert(sc, brt)) != 0) { + zfree(bridge_rtnode_pool, brt); + return (error); + } + } + + brt->brt_ifp = dst_if; + if (setflags) { + brt->brt_flags = flags; + brt->brt_expire = (flags & IFBAF_STATIC) ? 
0 : + now.tv_sec + sc->sc_brttimeout; + } + + /* APPLE MODIFICATION - add support for Proxy STA - */ +#if IEEE80211_PROXYSTA + if (is_pds) { +#if VERY_VERY_DIAGNOSTIC + printf( "%s: proxysta %02x:%02x:%02x:%02x:%02x:%02x on %s; reset timeout\n", + __func__, dst[0], dst[1], dst[2], dst[3], dst[4], dst[5], dst_if->if_xname ); +#endif + brt->brt_expire = (flags & IFBAF_STATIC) ? 0 : + now.tv_sec + sc->sc_brttimeout; + } +#endif + + return (0); +} + +/* + * bridge_rtlookup: + * + * Lookup the destination interface for an address. + */ +static struct ifnet * +bridge_rtlookup(struct bridge_softc *sc, const uint8_t *addr) +{ + struct bridge_rtnode *brt; + + if ((brt = bridge_rtnode_lookup(sc, addr)) == NULL) + return (NULL); + + return (brt->brt_ifp); +} + +/* + * bridge_rttrim: + * + * Trim the routine table so that we have a number + * of routing entries less than or equal to the + * maximum number. + */ +static void +bridge_rttrim(struct bridge_softc *sc) +{ + struct bridge_rtnode *brt, *nbrt; + + /* Make sure we actually need to do this. */ + if (sc->sc_brtcnt <= sc->sc_brtmax) + return; + + /* Force an aging cycle; this might trim enough addresses. */ + bridge_rtage(sc); + if (sc->sc_brtcnt <= sc->sc_brtmax) + return; + + for (brt = LIST_FIRST(&sc->sc_rtlist); brt != NULL; brt = nbrt) { + nbrt = LIST_NEXT(brt, brt_list); + if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { + bridge_rtnode_destroy(sc, brt); + if (sc->sc_brtcnt <= sc->sc_brtmax) + return; + } + } +} + +/* + * bridge_timer: + * + * Aging timer for the bridge. + */ +static void +bridge_timer(void *arg) +{ + struct bridge_softc *sc = arg; + struct timespec ts; + + lck_mtx_lock(sc->sc_mtx); + + bridge_rtage(sc); + + lck_mtx_unlock(sc->sc_mtx); + + if (ifnet_flags(sc->sc_if) & IFF_RUNNING) { + ts.tv_sec = bridge_rtable_prune_period; + ts.tv_nsec = 0; + bsd_timeout(bridge_timer, sc, &ts); + } +} + +/* + * bridge_rtage: + * + * Perform an aging cycle. 
+ */
+static void
+bridge_rtage(struct bridge_softc *sc)
+{
+	struct bridge_rtnode *brt, *nbrt;
+	struct timespec now;
+
+	nanouptime(&now);	/* one uptime sample for the whole pass */
+
+	for (brt = LIST_FIRST(&sc->sc_rtlist); brt != NULL; brt = nbrt) {
+		nbrt = LIST_NEXT(brt, brt_list);	/* fetched first: brt may be destroyed below */
+		if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) {
+			if ((unsigned long)now.tv_sec >= brt->brt_expire)
+				bridge_rtnode_destroy(sc, brt);
+		}
+	}
+}
+
+/*
+ * bridge_rtflush:
+ *
+ *	Remove all dynamic addresses from the bridge.  When `full' is
+ *	non-zero every entry is removed, whatever its type.
+ */
+static void
+bridge_rtflush(struct bridge_softc *sc, int full)
+{
+	struct bridge_rtnode *brt, *nbrt;
+
+	for (brt = LIST_FIRST(&sc->sc_rtlist); brt != NULL; brt = nbrt) {
+		nbrt = LIST_NEXT(brt, brt_list);	/* fetched first: brt may be destroyed below */
+		if (full || (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)
+			bridge_rtnode_destroy(sc, brt);
+	}
+}
+
+/* APPLE MODIFICATION - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
+/*
+ * bridge_rtdiscovery:
+ *
+ *	Walk the route list.  An entry whose interface is a bridge
+ *	member with IFBIF_PROXYSTA_DISCOVER set, and which is not yet
+ *	tagged, gets IFBAF_EXT_PROXYSTA.  bridge_proxysta_discover() is
+ *	then called for every entry that has no brt_ifp_proxysta.
+ *
+ *	NOTE(review): the second test is not limited to the entries
+ *	tagged by the first one -- confirm that this is intended.
+ */
+static void
+bridge_rtdiscovery(struct bridge_softc *sc)
+{
+	struct bridge_rtnode *brt, *nbrt;
+	struct bridge_iflist *bif;
+
+	for (brt = LIST_FIRST(&sc->sc_rtlist); brt != NULL; brt = nbrt) {
+		nbrt = LIST_NEXT(brt, brt_list);
+		bif = bridge_lookup_member_if(sc, brt->brt_ifp);
+		if ((bif) && (bif->bif_flags & IFBIF_PROXYSTA_DISCOVER) &&
+		    ((brt->brt_flags_ext & IFBAF_EXT_PROXYSTA) == 0)) {
+#if DIAGNOSTIC
+			printf( "%s: proxysta %02x:%02x:%02x:%02x:%02x:%02x on %s; found before IFBIF_PROXYSTA_DISCOVER\n",
+				__func__, brt->brt_addr[0], brt->brt_addr[1], brt->brt_addr[2], brt->brt_addr[3],
+				brt->brt_addr[4], brt->brt_addr[5], brt->brt_ifp->if_xname );
+#endif
+			brt->brt_flags_ext |= IFBAF_EXT_PROXYSTA;
+		}
+
+		if (brt->brt_ifp_proxysta == NULL) {
+#if DIAGNOSTIC
+			printf( "%s: proxysta %02x:%02x:%02x:%02x:%02x:%02x on %s; discovery\n",
+				__func__, brt->brt_addr[0], brt->brt_addr[1], brt->brt_addr[2], brt->brt_addr[3],
+				brt->brt_addr[4], brt->brt_addr[5], brt->brt_ifp->if_xname );
+#endif
+			bridge_proxysta_discover( brt->brt_ifp, brt->brt_addr );
+		}
+	}
+}
+
+/*
+ * bridge_rtpurge:
+ *
+ *	Remove every address recorded for a specific interface on the
+ *	bridge.  The entry type is not tested, so non-dynamic entries
+ *	on `ifs' are removed as well.
+ */
+static void
+bridge_rtpurge(struct bridge_softc *sc, struct ifnet *ifs)
+{
+	struct bridge_rtnode *brt, *nbrt;
+
+	for (brt = LIST_FIRST(&sc->sc_rtlist); brt != NULL; brt = nbrt) {
+		nbrt = LIST_NEXT(brt, brt_list);	/* fetched first: brt may be destroyed below */
+		if (brt->brt_ifp == ifs) {
+#if DIAGNOSTIC
+			printf( "%s: purge %s [%02x:%02x:%02x:%02x:%02x:%02x] discovered on %s\n",
+				__func__, brt->brt_ifp_proxysta ? brt->brt_ifp_proxysta->if_xname : brt->brt_ifp->if_xname,
+				brt->brt_addr[0], brt->brt_addr[1], brt->brt_addr[2],
+				brt->brt_addr[3], brt->brt_addr[4], brt->brt_addr[5], brt->brt_ifp->if_xname );
+#endif
+			bridge_rtnode_destroy(sc, brt);
+		}
+	}
+}
+#endif
+
+/*
+ * bridge_rtdaddr:
+ *
+ *	Remove an address from the table.  Returns 0 on success or
+ *	ENOENT when no entry matches `addr'.
+ */
+static int
+bridge_rtdaddr(struct bridge_softc *sc, const uint8_t *addr)
+{
+	struct bridge_rtnode *brt;
+
+	if ((brt = bridge_rtnode_lookup(sc, addr)) == NULL)
+		return (ENOENT);
+
+	bridge_rtnode_destroy(sc, brt);
+	return (0);
+}
+
+/*
+ * bridge_rtdelete:
+ *
+ *	Delete routes to a specific member interface.  Only dynamic
+ *	entries are removed unless `full' is non-zero, in which case
+ *	every entry pointing at `ifp' goes.
+ */
+__private_extern__ void
+bridge_rtdelete(struct bridge_softc *sc, struct ifnet *ifp, int full)
+{
+	struct bridge_rtnode *brt, *nbrt;
+
+	for (brt = LIST_FIRST(&sc->sc_rtlist); brt != NULL; brt = nbrt) {
+		nbrt = LIST_NEXT(brt, brt_list);	/* fetched first: brt may be destroyed below */
+		if (brt->brt_ifp == ifp && (full ||
+		    (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC))
+			bridge_rtnode_destroy(sc, brt);
+	}
+}
+
+/*
+ * bridge_rtable_init:
+ *
+ *	Initialize the route table for this bridge.
+ */ +static int +bridge_rtable_init(struct bridge_softc *sc) +{ + int i; + + sc->sc_rthash = _MALLOC(sizeof(*sc->sc_rthash) * BRIDGE_RTHASH_SIZE, + M_DEVBUF, M_WAITOK); + if (sc->sc_rthash == NULL) + return (ENOMEM); + + for (i = 0; i < BRIDGE_RTHASH_SIZE; i++) + LIST_INIT(&sc->sc_rthash[i]); + + sc->sc_rthash_key = random(); + + LIST_INIT(&sc->sc_rtlist); + + return (0); +} + +/* + * bridge_rtable_fini: + * + * Deconstruct the route table for this bridge; only the hash bucket array is freed here, route nodes are not released. + */ +static void +bridge_rtable_fini(struct bridge_softc *sc) +{ + + _FREE(sc->sc_rthash, M_DEVBUF); +} + +/* + * The following hash function is adapted from "Hash Functions" by Bob Jenkins + * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). + */ +#define mix(a, b, c) \ +do { \ +a -= b; a -= c; a ^= (c >> 13); \ +b -= c; b -= a; b ^= (a << 8); \ +c -= a; c -= b; c ^= (b >> 13); \ +a -= b; a -= c; a ^= (c >> 12); \ +b -= c; b -= a; b ^= (a << 16); \ +c -= a; c -= b; c ^= (b >> 5); \ +a -= b; a -= c; a ^= (c >> 3); \ +b -= c; b -= a; b ^= (a << 10); \ +c -= a; c -= b; c ^= (b >> 15); \ +} while (/*CONSTCOND*/0) + +static uint32_t +bridge_rthash(__unused struct bridge_softc *sc, const uint8_t *addr) +{ + /* APPLE MODIFICATION - wasabi performance improvement - simplify the hash algorithm: the bucket index is just the last byte of the address; the Jenkins mix below is compiled out */ +#if 0 + uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->sc_rthash_key; + + b += addr[5] << 8; + b += addr[4]; + a += addr[3] << 24; + a += addr[2] << 16; + a += addr[1] << 8; + a += addr[0]; + + mix(a, b, c); + + return (c & BRIDGE_RTHASH_MASK); +#else + return addr[5]; +#endif +} + +#undef mix + +/* + * bridge_rtnode_lookup: + * + * Look up a bridge route node for the specified destination.
+ */ +static struct bridge_rtnode * +bridge_rtnode_lookup(struct bridge_softc *sc, const uint8_t *addr) +{ + struct bridge_rtnode *brt; + uint32_t hash; + int dir; + + hash = bridge_rthash(sc, addr); + LIST_FOREACH(brt, &sc->sc_rthash[hash], brt_hash) { + dir = memcmp(addr, brt->brt_addr, ETHER_ADDR_LEN); + if (dir == 0) + return (brt); + if (dir > 0) + return (NULL); + } + + return (NULL); +} + +/* + * bridge_rtnode_insert: + * + * Insert the specified bridge node into the route table. We + * assume the entry is not already in the table. + */ +static int +bridge_rtnode_insert(struct bridge_softc *sc, struct bridge_rtnode *brt) +{ + struct bridge_rtnode *lbrt; + uint32_t hash; + int dir; + + hash = bridge_rthash(sc, brt->brt_addr); + + lbrt = LIST_FIRST(&sc->sc_rthash[hash]); + if (lbrt == NULL) { + LIST_INSERT_HEAD(&sc->sc_rthash[hash], brt, brt_hash); + goto out; + } + + do { + dir = memcmp(brt->brt_addr, lbrt->brt_addr, ETHER_ADDR_LEN); + if (dir == 0) + return (EEXIST); + if (dir > 0) { + LIST_INSERT_BEFORE(lbrt, brt, brt_hash); + goto out; + } + if (LIST_NEXT(lbrt, brt_hash) == NULL) { + LIST_INSERT_AFTER(lbrt, brt, brt_hash); + goto out; + } + lbrt = LIST_NEXT(lbrt, brt_hash); + } while (lbrt != NULL); + +#ifdef DIAGNOSTIC + panic("bridge_rtnode_insert: impossible"); +#endif + +out: + LIST_INSERT_HEAD(&sc->sc_rtlist, brt, brt_list); + sc->sc_brtcnt++; + + return (0); +} + +/* + * bridge_rtnode_destroy: + * + * Destroy a bridge rtnode. + */ +static void +bridge_rtnode_destroy(struct bridge_softc *sc, struct bridge_rtnode *brt) +{ + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); + + /* APPLE MODIFICATION - add support for Proxy STA */ +#if IEEE80211_PROXYSTA + if (brt->brt_flags_ext & IFBAF_EXT_PROXYSTA) { +#if DIAGNOSTIC + printf( "%s: proxysta %02x:%02x:%02x:%02x:%02x:%02x %s from %s; idle timeout\n", + __func__, brt->brt_addr[0], brt->brt_addr[1], brt->brt_addr[2], + brt->brt_addr[3], brt->brt_addr[4], brt->brt_addr[5], + brt->brt_ifp_proxysta ? 
brt->brt_ifp_proxysta->if_xname : "unknown", + brt->brt_ifp->if_xname ); +#endif + bridge_proxysta_idle_timeout( brt->brt_ifp, brt->brt_addr ); + } +#endif + + LIST_REMOVE(brt, brt_hash); + + LIST_REMOVE(brt, brt_list); + sc->sc_brtcnt--; + zfree(bridge_rtnode_pool, brt); +} + +static errno_t +bridge_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func bpf_callback) +{ + struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); + + //printf("bridge_set_bpf_tap ifp %p mode %d\n", ifp, mode); + + /* TBD locking */ + if (sc == NULL || (sc->sc_flags & SCF_DETACHING)) { + return ENODEV; + } + + switch (mode) { + case BPF_TAP_DISABLE: + sc->sc_bpf_input = sc->sc_bpf_output = NULL; + break; + + case BPF_TAP_INPUT: + sc->sc_bpf_input = bpf_callback; + break; + + case BPF_TAP_OUTPUT: + sc->sc_bpf_output = bpf_callback; + break; + + case BPF_TAP_INPUT_OUTPUT: + sc->sc_bpf_input = sc->sc_bpf_output = bpf_callback; + break; + + default: + break; + } + + return 0; +} + +static void +bridge_detach(__unused ifnet_t ifp) +{ + struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); + + /* Tear down the routing table. 
*/ + bridge_rtable_fini(sc); + + lck_rw_lock_exclusive(bridge_list_lock); + LIST_REMOVE(sc, sc_list); + lck_rw_done(bridge_list_lock); + + ifnet_release(ifp); + + lck_mtx_free(sc->sc_mtx, bridge_lock_grp); + + _FREE(sc, M_DEVBUF); + return; +} + +__private_extern__ errno_t bridge_bpf_input(ifnet_t ifp, struct mbuf *m) +{ + struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); + + if (sc->sc_bpf_input) { + if (mbuf_pkthdr_rcvif(m) != ifp) + printf("bridge_bpf_input rcvif: %p != ifp %p\n", mbuf_pkthdr_rcvif(m), ifp); + (*sc->sc_bpf_input)(ifp, m); + } + return 0; +} + +__private_extern__ errno_t bridge_bpf_output(ifnet_t ifp, struct mbuf *m) +{ + struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); + + if (sc->sc_bpf_output) { + (*sc->sc_bpf_output)(ifp, m); + } + return 0; +} + diff --git a/bsd/net/if_bridgevar.h b/bsd/net/if_bridgevar.h new file mode 100644 index 000000000..6b47c922e --- /dev/null +++ b/bsd/net/if_bridgevar.h @@ -0,0 +1,484 @@ +/* + * Copyright (c) 2004-2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $apfw: if_bridgevar,v 1.7 2008/10/24 02:34:06 cbzimmer Exp $ */ +/* $NetBSD: if_bridgevar.h,v 1.8 2005/12/10 23:21:38 elad Exp $ */ + +/* + * Copyright 2001 Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Jason R. Thorpe for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Jason L. Wright + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * OpenBSD: if_bridge.h,v 1.14 2001/03/22 03:48:29 jason Exp + */ + +/* + * Data structure and control definitions for bridge interfaces. + */ + +#ifndef _NET_IF_BRIDGEVAR_H_ +#define _NET_IF_BRIDGEVAR_H_ + +#ifdef PRIVATE + +#include + +#include +#include + +/* + * Commands used in the SIOCSDRVSPEC ioctl. Note the lookup of the + * bridge interface itself is keyed off the ifdrv structure. + */ +#define BRDGADD 0 /* add bridge member (ifbreq) */ +#define BRDGDEL 1 /* delete bridge member (ifbreq) */ +#define BRDGGIFFLGS 2 /* get member if flags (ifbreq) */ +#define BRDGSIFFLGS 3 /* set member if flags (ifbreq) */ +#define BRDGSCACHE 4 /* set cache size (ifbrparam) */ +#define BRDGGCACHE 5 /* get cache size (ifbrparam) */ +#define BRDGGIFS 6 /* get member list (ifbifconf) */ +#define BRDGRTS 7 /* get address list (ifbaconf) */ +#define BRDGSADDR 8 /* set static address (ifbareq) */ +#define BRDGSTO 9 /* set cache timeout (ifbrparam) */ +#define BRDGGTO 10 /* get cache timeout (ifbrparam) */ +#define BRDGDADDR 11 /* delete address (ifbareq) */ +#define BRDGFLUSH 12 /* flush address cache (ifbreq) */ + +#define BRDGGPRI 13 /* get priority (ifbrparam) */ +#define BRDGSPRI 14 /* set priority (ifbrparam) */ +#define BRDGGHT 15 /* get hello time (ifbrparam) */ +#define BRDGSHT 16 /* set hello time (ifbrparam) */ +#define BRDGGFD 17 /* get forward delay (ifbrparam) */ +#define BRDGSFD 18 /* set forward delay (ifbrparam) */ +#define BRDGGMA 19 /* get max age 
(ifbrparam) */ +#define BRDGSMA 20 /* set max age (ifbrparam) */ +#define BRDGSIFPRIO 21 /* set if priority (ifbreq) */ +#define BRDGSIFCOST 22 /* set if path cost (ifbreq) */ +#define BRDGGFILT 23 /* get filter flags (ifbrparam) */ +#define BRDGSFILT 24 /* set filter flags (ifbrparam) */ +#define BRDGPURGE 25 /* purge address cache for a particular interface (ifbreq) */ + +/* + * Generic bridge control request. + */ +#pragma pack(4) + +struct ifbreq { + char ifbr_ifsname[IFNAMSIZ]; /* member if name */ + uint32_t ifbr_ifsflags; /* member if flags */ + uint16_t ifbr_portno; /* member if port number */ + uint8_t ifbr_state; /* member if STP state */ + uint8_t ifbr_priority; /* member if STP priority */ + uint8_t ifbr_path_cost; /* member if STP cost */ +}; + +#pragma pack() + +/* BRDGGIFFLGS, BRDGSIFFLGS */ +#define IFBIF_LEARNING 0x01 /* if can learn */ +#define IFBIF_DISCOVER 0x02 /* if sends packets w/ unknown dest. */ +#define IFBIF_STP 0x04 /* if participates in spanning tree */ +/* APPLE MODIFICATION + add the following bits for ProxySTA: + IFBIF_PROXYSTA, IFBIF_PROXYSTA_DISCOVER + add the following bits for Guest Network + IFBIF_NO_FORWARDING + */ +#define IFBIF_PROXYSTA 0x08 /* if interface is a proxy sta */ +#define IFBIF_PROXYSTA_DISCOVER 0x10 /* if interface is used to discover proxy sta candidates */ +#define IFBIF_NO_FORWARDING 0x20 /* if interface cannot forward traffic from one interface to the next */ + +/* APPLE MODIFICATION + add the following bits for ProxySTA: + PROXYSTA, PROXYSTA_DISCOVER + add the following bits for Guest Network + NO_FORWARDING + this was...
+ + #define IFBIFBITS "\020\1LEARNING\2DISCOVER\3STP" + */ +#define IFBIFBITS "\020\1LEARNING\2DISCOVER\3STP\4PROXYSTA\5PROXYSTA_DISCOVER\6NO_FORWARDING" + +/* BRDGFLUSH */ +#define IFBF_FLUSHDYN 0x00 /* flush learned addresses only */ +#define IFBF_FLUSHALL 0x01 /* flush all addresses */ + +/* BRDGSFILT */ +#define IFBF_FILT_USEIPF 0x00000001 /* run pfil hooks on the bridge +interface */ +#define IFBF_FILT_MEMBER 0x00000002 /* run pfil hooks on the member +interfaces */ +#define IFBF_FILT_ONLYIP 0x00000004 /* only pass IP[46] packets when +pfil is enabled */ +#define IFBF_FILT_MASK 0x00000007 /* mask of valid values */ + + +/* APPLE MODIFICATION : Default is to pass non-IP packets. */ +#define IFBF_FILT_DEFAULT ( IFBF_FILT_USEIPF | IFBF_FILT_MEMBER ) +#if 0 +#define IFBF_FILT_DEFAULT (IFBF_FILT_USEIPF | \ +IFBF_FILT_MEMBER | \ +IFBF_FILT_ONLYIP) +#endif + +/* STP port states */ +#define BSTP_IFSTATE_DISABLED 0 +#define BSTP_IFSTATE_LISTENING 1 +#define BSTP_IFSTATE_LEARNING 2 +#define BSTP_IFSTATE_FORWARDING 3 +#define BSTP_IFSTATE_BLOCKING 4 + +/* + * Interface list structure. + */ + +#pragma pack(4) + +struct ifbifconf { + uint32_t ifbic_len; /* buffer size */ + union { + caddr_t ifbicu_buf; + struct ifbreq *ifbicu_req; + } ifbic_ifbicu; +#define ifbic_buf ifbic_ifbicu.ifbicu_buf +#define ifbic_req ifbic_ifbicu.ifbicu_req +}; + +#ifdef KERNEL_PRIVATE +struct ifbifconf32 { + uint32_t ifbic_len; /* buffer size */ + union { + user32_addr_t ifbicu_buf; + user32_addr_t ifbicu_req; + } ifbic_ifbicu; +}; + +struct ifbifconf64 { + uint32_t ifbic_len; /* buffer size */ + union { + user64_addr_t ifbicu_buf; + user64_addr_t ifbicu_req; + } ifbic_ifbicu; +}; +#endif /* KERNEL_PRIVATE */ + +#pragma pack() + +/* + * Bridge address request. 
+ */ + +#pragma pack(4) + +struct ifbareq { + char ifba_ifsname[IFNAMSIZ]; /* member if name */ + unsigned long ifba_expire; /* address expire time */ + uint8_t ifba_flags; /* address flags */ + uint8_t ifba_dst[ETHER_ADDR_LEN];/* destination address */ +}; + +#ifdef KERNEL_PRIVATE +struct ifbareq32 { + char ifba_ifsname[IFNAMSIZ]; /* member if name */ + uint32_t ifba_expire; /* address expire time */ + uint8_t ifba_flags; /* address flags */ + uint8_t ifba_dst[ETHER_ADDR_LEN];/* destination address */ +}; + +struct ifbareq64 { + char ifba_ifsname[IFNAMSIZ]; /* member if name */ + uint64_t ifba_expire; /* address expire time */ + uint8_t ifba_flags; /* address flags */ + uint8_t ifba_dst[ETHER_ADDR_LEN];/* destination address */ +}; +#endif /* KERNEL_PRIVATE */ + +#pragma pack() + +#define IFBAF_TYPEMASK 0x03 /* address type mask */ +#define IFBAF_DYNAMIC 0x00 /* dynamically learned address */ +#define IFBAF_STATIC 0x01 /* static address */ + +#define IFBAFBITS "\020\1STATIC" + +/* + * Address list structure. + */ + +#pragma pack(4) + +struct ifbaconf { + uint32_t ifbac_len; /* buffer size */ + union { + caddr_t ifbacu_buf; + struct ifbareq *ifbacu_req; + } ifbac_ifbacu; +#define ifbac_buf ifbac_ifbacu.ifbacu_buf +#define ifbac_req ifbac_ifbacu.ifbacu_req +}; + +#ifdef KERNEL_PRIVATE +struct ifbaconf32 { + uint32_t ifbac_len; /* buffer size */ + union { + user32_addr_t ifbacu_buf; + user32_addr_t ifbacu_req; + } ifbac_ifbacu; +}; + +struct ifbaconf64 { + uint32_t ifbac_len; /* buffer size */ + union { + user64_addr_t ifbacu_buf; + user64_addr_t ifbacu_req; + } ifbac_ifbacu; +}; +#endif /* KERNEL_PRIVATE */ + +#pragma pack() + +/* + * Bridge parameter structure. 
+ */ + +#pragma pack(4) + +struct ifbrparam { + union { + uint32_t ifbrpu_int32; + uint16_t ifbrpu_int16; + uint8_t ifbrpu_int8; + } ifbrp_ifbrpu; +}; + +#pragma pack() + +#define ifbrp_csize ifbrp_ifbrpu.ifbrpu_int32 /* cache size */ +#define ifbrp_ctime ifbrp_ifbrpu.ifbrpu_int32 /* cache time (sec) */ +#define ifbrp_prio ifbrp_ifbrpu.ifbrpu_int16 /* bridge priority */ +#define ifbrp_hellotime ifbrp_ifbrpu.ifbrpu_int8 /* hello time (sec) */ +#define ifbrp_fwddelay ifbrp_ifbrpu.ifbrpu_int8 /* fwd time (sec) */ +#define ifbrp_maxage ifbrp_ifbrpu.ifbrpu_int8 /* max age (sec) */ +#define ifbrp_filter ifbrp_ifbrpu.ifbrpu_int32 /* filtering flags */ + +#ifdef KERNEL +/* + * Timekeeping structure used in spanning tree code. + */ +struct bridge_timer { + uint16_t active; + uint16_t value; +}; + +struct bstp_config_unit { + uint64_t cu_rootid; + uint64_t cu_bridge_id; + uint32_t cu_root_path_cost; + uint16_t cu_message_age; + uint16_t cu_max_age; + uint16_t cu_hello_time; + uint16_t cu_forward_delay; + uint16_t cu_port_id; + uint8_t cu_message_type; + uint8_t cu_topology_change_acknowledgment; + uint8_t cu_topology_change; +}; + +struct bstp_tcn_unit { + uint8_t tu_message_type; +}; + +struct bridge_softc; + +/* + * Bridge interface list entry. 
+ * (VL) bridge_ifmember would be a better name, more descriptive + */ +struct bridge_iflist { + LIST_ENTRY(bridge_iflist) bif_next; + uint64_t bif_designated_root; + uint64_t bif_designated_bridge; + uint32_t bif_path_cost; + uint32_t bif_designated_cost; + struct bridge_timer bif_hold_timer; + struct bridge_timer bif_message_age_timer; + struct bridge_timer bif_forward_delay_timer; + uint16_t bif_port_id; + uint16_t bif_designated_port; + struct bstp_config_unit bif_config_bpdu; + uint8_t bif_state; + uint8_t bif_topology_change_acknowledge; + uint8_t bif_config_pending; + uint8_t bif_change_detection_enabled; + uint8_t bif_priority; + struct ifnet *bif_ifp; /* member if */ + uint32_t bif_flags; /* member if flags */ + int bif_mutecap; /* member muted caps */ + interface_filter_t bif_iff_ref; + struct bridge_softc *bif_sc; +}; + +/* + * Bridge route node. + */ +struct bridge_rtnode { + LIST_ENTRY(bridge_rtnode) brt_hash; /* hash table linkage */ + LIST_ENTRY(bridge_rtnode) brt_list; /* list linkage */ + struct ifnet *brt_ifp; /* destination if */ + unsigned long brt_expire; /* expiration time */ + uint8_t brt_flags; /* address flags */ + uint8_t brt_addr[ETHER_ADDR_LEN]; + /* APPLE MODIFICATION - add the following elements: + brt_flags_ext, brt_ifp_proxysta */ +#define IFBAF_EXT_PROXYSTA 0x01 + uint8_t brt_flags_ext; /* extended flags */ + struct ifnet *brt_ifp_proxysta; /* proxy sta if */ +}; + + +/* + * Software state for each bridge. 
+ */ +struct bridge_softc { + LIST_ENTRY(bridge_softc) sc_list; + struct ifnet *sc_if; + uint64_t sc_designated_root; + uint64_t sc_bridge_id; + struct bridge_iflist *sc_root_port; + uint32_t sc_root_path_cost; + uint16_t sc_max_age; + uint16_t sc_hello_time; + uint16_t sc_forward_delay; + uint16_t sc_bridge_max_age; + uint16_t sc_bridge_hello_time; + uint16_t sc_bridge_forward_delay; + uint16_t sc_topology_change_time; + uint16_t sc_hold_time; + uint16_t sc_bridge_priority; + uint8_t sc_topology_change_detected; + uint8_t sc_topology_change; + struct bridge_timer sc_hello_timer; + struct bridge_timer sc_topology_change_timer; + struct bridge_timer sc_tcn_timer; + uint32_t sc_brtmax; /* max # of addresses */ + uint32_t sc_brtcnt; /* cur. # of addresses */ + /* APPLE MODIFICATION - add the following elements: + sc_brtmax_proxysta */ + uint32_t sc_brtmax_proxysta; /* max # of proxy sta addresses */ + uint32_t sc_brttimeout; /* rt timeout in seconds */ + LIST_HEAD(, bridge_iflist) sc_iflist; /* member interface list */ + LIST_HEAD(, bridge_rtnode) *sc_rthash; /* our forwarding table */ + LIST_HEAD(, bridge_rtnode) sc_rtlist; /* list version of above */ + uint32_t sc_rthash_key; /* key for hash */ + uint32_t sc_filter_flags; /* ipf and flags */ + + //(VL) + char sc_if_xname[IFNAMSIZ]; + bpf_packet_func sc_bpf_input; + bpf_packet_func sc_bpf_output; + u_int32_t sc_flags; + lck_mtx_t *sc_mtx; +}; + +#define SCF_DETACHING 0x1 + +extern const uint8_t bstp_etheraddr[]; + +int bridgeattach(int); +void bridge_enqueue(struct bridge_softc *, struct ifnet *, struct mbuf *); +void bridge_rtdelete(struct bridge_softc *, struct ifnet *, int); + +void bstp_initialization(struct bridge_softc *); +void bstp_stop(struct bridge_softc *); +struct mbuf *bstp_input(struct bridge_softc *, struct ifnet *, struct mbuf *); + + +#endif /* KERNEL */ +#endif /* PRIVATE */ +#endif /* !_NET_IF_BRIDGEVAR_H_ */ + diff --git a/bsd/net/if_ethersubr.c b/bsd/net/if_ethersubr.c index 8d82c530d..e407e009f 
100644 --- a/bsd/net/if_ethersubr.c +++ b/bsd/net/if_ethersubr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000, 2009 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,10 +95,6 @@ extern struct ifqueue pkintrq; #endif -#if BRIDGE -#include -#endif - /* #include "vlan.h" */ #if NVLAN > 0 #include diff --git a/bsd/net/if_llc.h b/bsd/net/if_llc.h index 7b0d446e2..dade70621 100644 --- a/bsd/net/if_llc.h +++ b/bsd/net/if_llc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000,2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -37,11 +37,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -57,7 +53,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)if_llc.h 8.1 (Berkeley) 6/10/93 + * @(#)if_llc.h 8.1 (Berkeley) 6/10/93 */ #ifndef _NET_IF_LLC_H_ @@ -73,55 +69,67 @@ */ struct llc { - u_char llc_dsap; - u_char llc_ssap; + u_int8_t llc_dsap; + u_int8_t llc_ssap; union { struct { - u_char control; - u_char format_id; - u_char class_id; - u_char window_x2; + u_int8_t control; + u_int8_t format_id; + u_int8_t class_id; + u_int8_t window_x2; } type_u; struct { - u_char num_snd_x2; - u_char num_rcv_x2; + u_int8_t num_snd_x2; + u_int8_t num_rcv_x2; } type_i; struct { - u_char control; - u_char num_rcv_x2; + u_int8_t control; + u_int8_t num_rcv_x2; } type_s; struct { - u_char control; - struct frmrinfo { - u_char rej_pdu_0; - u_char rej_pdu_1; - u_char frmr_control; - u_char frmr_control_ext; - u_char frmr_cause; - } frmrinfo; + u_int8_t control; + /* + * We cannot put the following fields in a structure because + * the structure rounding might cause padding. + */ + u_int8_t frmr_rej_pdu0; + u_int8_t frmr_rej_pdu1; + u_int8_t frmr_control; + u_int8_t frmr_control_ext; + u_int8_t frmr_cause; } type_frmr; struct { - u_char control; - u_char org_code[3]; - u_short ether_type; - } type_snap; + u_int8_t control; + u_int8_t org_code[3]; + u_int16_t ether_type; + } type_snap __attribute__((__packed__)); struct { - u_char control; - u_char control_ext; + u_int8_t control; + u_int8_t control_ext; } type_raw; } llc_un; -}; -#define llc_control llc_un.type_u.control -#define llc_control_ext llc_un.type_raw.control_ext -#define llc_fid llc_un.type_u.format_id -#define llc_class llc_un.type_u.class_id -#define llc_window llc_un.type_u.window_x2 -#define llc_frmrinfo llc_un.type_frmr.frmrinfo -#define llc_frmr_pdu0 llc_un.type_frmr.frmrinfo.rej_pdu0 -#define llc_frmr_pdu1 llc_un.type_frmr.frmrinfo.rej_pdu1 -#define llc_frmr_control llc_un.type_frmr.frmrinfo.frmr_control -#define llc_frmr_control_ext llc_un.type_frmr.frmrinfo.frmr_control_ext -#define llc_frmr_cause llc_un.type_frmr.frmrinfo.frmr_control_ext +} 
__attribute__((__packed__)); + +struct frmrinfo { + u_int8_t frmr_rej_pdu0; + u_int8_t frmr_rej_pdu1; + u_int8_t frmr_control; + u_int8_t frmr_control_ext; + u_int8_t frmr_cause; +} __attribute__((__packed__)); + +#define llc_control llc_un.type_u.control +#define llc_control_ext llc_un.type_raw.control_ext +#define llc_fid llc_un.type_u.format_id +#define llc_class llc_un.type_u.class_id /* type_u declares class_id; there is no member named class */ +#define llc_window llc_un.type_u.window_x2 +#define llc_frmrinfo llc_un.type_frmr.frmr_rej_pdu0 +#define llc_frmr_pdu0 llc_un.type_frmr.frmr_rej_pdu0 +#define llc_frmr_pdu1 llc_un.type_frmr.frmr_rej_pdu1 +#define llc_frmr_control llc_un.type_frmr.frmr_control +#define llc_frmr_control_ext llc_un.type_frmr.frmr_control_ext +#define llc_frmr_cause llc_un.type_frmr.frmr_cause +#define llc_snap llc_un.type_snap /* * Don't use sizeof(struct llc_un) for LLC header sizes @@ -129,6 +137,7 @@ struct llc { #define LLC_ISFRAMELEN 4 #define LLC_UFRAMELEN 3 #define LLC_FRMRLEN 7 +#define LLC_SNAPFRAMELEN 8 /* * Unnumbered LLC format commands @@ -165,8 +174,22 @@ struct llc { /* * ISO PDTR 10178 contains among others */ +#define LLC_8021D_LSAP 0x42 #define LLC_X25_LSAP 0x7e #define LLC_SNAP_LSAP 0xaa #define LLC_ISO_LSAP 0xfe -#endif +/* + * LLC XID definitions from 802.2, as needed + */ + +#define LLC_XID_FORMAT_BASIC 0x81 +#define LLC_XID_BASIC_MINLEN (LLC_UFRAMELEN + 3) + +#define LLC_XID_CLASS_I 0x1 +#define LLC_XID_CLASS_II 0x3 +#define LLC_XID_CLASS_III 0x5 +#define LLC_XID_CLASS_IV 0x7 + + +#endif /* !_NET_IF_LLC_H_ */ diff --git a/bsd/net/if_types.h b/bsd/net/if_types.h index a8b580130..4eced169b 100644 --- a/bsd/net/if_types.h +++ b/bsd/net/if_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -133,6 +133,7 @@ #define IFT_L2VLAN 0x87 /* Layer 2 Virtual LAN using 802.1Q */ #define IFT_IEEE8023ADLAG 0x88 /* IEEE802.3ad Link Aggregate */ #define IFT_IEEE1394 0x90 /* IEEE1394 High Performance SerialBus*/ +#define IFT_BRIDGE 0xd1 /* Transparent bridge interface */ /* * These are not based on IANA assignments: diff --git a/bsd/net/if_var.h b/bsd/net/if_var.h index 67d52d0a2..0601b7872 100644 --- a/bsd/net/if_var.h +++ b/bsd/net/if_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -487,6 +487,7 @@ struct ifnet { void *if_fwd_route_lock; #endif struct route if_fwd_route; /* cached IPv4 forwarding route */ + void *if_bridge; /* bridge glue */ }; #ifndef __APPLE__ diff --git a/bsd/net/if_vlan.c b/bsd/net/if_vlan.c index e1be1efd0..8ebcfb841 100644 --- a/bsd/net/if_vlan.c +++ b/bsd/net/if_vlan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2003-2009 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -106,8 +106,6 @@ #include #include -#define IF_MAXUNIT 0x7fff /* historical value */ - #define VLANNAME "vlan" typedef int (bpf_callback_func)(struct ifnet *, struct mbuf *); diff --git a/bsd/net/pf.c b/bsd/net/pf.c index cbc32f35d..5529d8056 100644 --- a/bsd/net/pf.c +++ b/bsd/net/pf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2009 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,7 +26,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* $apfw: pf.c,v 1.37 2008/12/05 23:10:20 jhw Exp $ */ +/* $apfw: git commit 7c8016ea91f7b68950cf41729c92dd8e3e423ba7 $ */ /* $OpenBSD: pf.c,v 1.567 2008/02/20 23:40:13 henning Exp $ */ /* @@ -272,7 +272,7 @@ static int pf_test_state_tcp(struct pf_state **, int, void *, struct pf_pdesc *, u_short *); static int pf_test_state_udp(struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, - void *, struct pf_pdesc *); + void *, struct pf_pdesc *, u_short *); static int pf_test_state_icmp(struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *, u_short *); @@ -469,22 +469,32 @@ pf_state_lookup_aux(struct pf_state **state, struct pfi_kif *kif, #define BOUND_IFACE(r, k) \ ((r)->rule_flag & PFRULE_IFBOUND) ? (k) : pfi_all -#define STATE_INC_COUNTERS(s) \ - do { \ - s->rule.ptr->states++; \ - if (s->anchor.ptr != NULL) \ - s->anchor.ptr->states++; \ - if (s->nat_rule.ptr != NULL) \ - s->nat_rule.ptr->states++; \ +#define STATE_INC_COUNTERS(s) \ + do { \ + s->rule.ptr->states++; \ + VERIFY(s->rule.ptr->states != 0); \ + if (s->anchor.ptr != NULL) { \ + s->anchor.ptr->states++; \ + VERIFY(s->anchor.ptr->states != 0); \ + } \ + if (s->nat_rule.ptr != NULL) { \ + s->nat_rule.ptr->states++; \ + VERIFY(s->nat_rule.ptr->states != 0); \ + } \ } while (0) -#define STATE_DEC_COUNTERS(s) \ - do { \ - if (s->nat_rule.ptr != NULL) \ - s->nat_rule.ptr->states--; \ - if (s->anchor.ptr != NULL) \ - s->anchor.ptr->states--; \ - s->rule.ptr->states--; \ +#define STATE_DEC_COUNTERS(s) \ + do { \ + if (s->nat_rule.ptr != NULL) { \ + VERIFY(s->nat_rule.ptr->states > 0); \ + s->nat_rule.ptr->states--; \ + } \ + if (s->anchor.ptr != NULL) { \ + VERIFY(s->anchor.ptr->states > 0); \ + s->anchor.ptr->states--; \ + } \ + VERIFY(s->rule.ptr->states > 0); \ + s->rule.ptr->states--; \ } while (0) static __inline int pf_src_compare(struct pf_src_node *, 
struct pf_src_node *); @@ -512,8 +522,8 @@ RB_GENERATE(pf_state_tree_id, pf_state, #define PF_DT_SKIP_EXTGWY 0x02 #ifndef NO_APPLE_EXTENSIONS -static const u_int16_t PF_PPTP_PORT = htons(1723); -static const u_int32_t PF_PPTP_MAGIC_NUMBER = htonl(0x1A2B3C4D); +static const u_int16_t PF_PPTP_PORT = 1723; +static const u_int32_t PF_PPTP_MAGIC_NUMBER = 0x1A2B3C4D; struct pf_pptp_hdr { u_int16_t length; @@ -762,7 +772,7 @@ struct pf_grev1_hdr { */ }; -static const u_int16_t PF_IKE_PORT = htons(500); +static const u_int16_t PF_IKE_PORT = 500; struct pf_ike_hdr { u_int64_t initiator_cookie, responder_cookie; @@ -1351,6 +1361,7 @@ pf_src_connlimit(struct pf_state **state) int bad = 0; (*state)->src_node->conn++; + VERIFY((*state)->src_node->conn != 0); (*state)->src.tcp_est = 1; pf_add_threshold(&(*state)->src_node->conn_rate); @@ -1612,6 +1623,7 @@ pf_insert_state(struct pfi_kif *kif, struct pf_state *s) TAILQ_INSERT_TAIL(&state_list, s, entry_list); pf_status.fcounters[FCNT_STATE_INSERT]++; pf_status.states++; + VERIFY(pf_status.states != 0); pfi_kif_ref(kif, PFI_KIF_REF_STATE); #if NPFSYNC pfsync_insert_state(s); @@ -1751,8 +1763,11 @@ pf_src_tree_remove_state(struct pf_state *s) lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); if (s->src_node != NULL) { - if (s->src.tcp_est) + if (s->src.tcp_est) { + VERIFY(s->src_node->conn > 0); --s->src_node->conn; + } + VERIFY(s->src_node->states > 0); if (--s->src_node->states <= 0) { t = s->rule.ptr->timeout[PFTM_SRC_NODE]; if (!t) @@ -1761,6 +1776,7 @@ pf_src_tree_remove_state(struct pf_state *s) } } if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) { + VERIFY(s->nat_src_node->states > 0); if (--s->nat_src_node->states <= 0) { t = s->rule.ptr->timeout[PFTM_SRC_NODE]; if (!t) @@ -1819,16 +1835,21 @@ pf_free_state(struct pf_state *cur) return; #endif VERIFY(cur->timeout == PFTM_UNLINKED); + VERIFY(cur->rule.ptr->states > 0); if (--cur->rule.ptr->states <= 0 && cur->rule.ptr->src_nodes <= 0) pf_rm_rule(NULL, 
cur->rule.ptr); - if (cur->nat_rule.ptr != NULL) + if (cur->nat_rule.ptr != NULL) { + VERIFY(cur->nat_rule.ptr->states > 0); if (--cur->nat_rule.ptr->states <= 0 && cur->nat_rule.ptr->src_nodes <= 0) pf_rm_rule(NULL, cur->nat_rule.ptr); - if (cur->anchor.ptr != NULL) + } + if (cur->anchor.ptr != NULL) { + VERIFY(cur->anchor.ptr->states > 0); if (--cur->anchor.ptr->states <= 0) pf_rm_rule(NULL, cur->anchor.ptr); + } pf_normalize_tcp_cleanup(cur); pfi_kif_unref(cur->kif, PFI_KIF_REF_STATE); TAILQ_REMOVE(&state_list, cur, entry_list); @@ -1836,6 +1857,7 @@ pf_free_state(struct pf_state *cur) pf_tag_unref(cur->tag); pool_put(&pf_state_pl, cur); pf_status.fcounters[FCNT_STATE_REMOVALS]++; + VERIFY(pf_status.states > 0); pf_status.states--; } @@ -3335,8 +3357,8 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r, unsigned int cut; sa_family_t af = pd->af; u_int8_t proto = pd->proto; - unsigned int low = ntohs(r->rpool.proxy_port[0]); - unsigned int high = ntohs(r->rpool.proxy_port[1]); + unsigned int low = r->rpool.proxy_port[0]; + unsigned int high = r->rpool.proxy_port[1]; #else u_int16_t cut; #endif @@ -3358,7 +3380,7 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r, if (proto == IPPROTO_UDP) { /*--- Never float IKE source port ---*/ - if (sxport->port == PF_IKE_PORT) { + if (ntohs(sxport->port) == PF_IKE_PORT) { nxport->port = sxport->port; return (0); } @@ -3387,9 +3409,30 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r, return (0); } } + } else if (proto == IPPROTO_TCP) { + struct pf_state* s; + /* + * APPLE MODIFICATION: + * Fix allows....NAT to use a single binding for TCP session + * with same source IP and source port + */ + TAILQ_FOREACH(s, &state_list, entry_list) { + struct pf_state_key* sk = s->state_key; + if (!sk) + continue; + if (s->nat_rule.ptr != r) + continue; + if (sk->proto != IPPROTO_TCP || sk->af != af) + continue; + if (sk->lan.xport.port != sxport->port) + continue; + if (!(PF_AEQ(&sk->lan.addr, 
saddr, af))) + continue; + nxport->port = sk->gwy.xport.port; + return (0); + } } #endif - do { key.af = af; key.proto = proto; @@ -3411,7 +3454,6 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r, #else key.ext.port = dport; #endif - /* * port search; start random, step; * similar 2 portloop in in_pcbbind @@ -3577,8 +3619,8 @@ pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off, src->neg, kif)) r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR : PF_SKIP_DST_ADDR].ptr; - else if (!pf_match_xport(r->proto, r->proto_variant, &src->xport, - sxport)) + else if (!pf_match_xport(r->proto, + r->proto_variant, &src->xport, sxport)) #else else if (PF_MISMATCHAW(&src->addr, saddr, pd->af, src->neg, kif)) @@ -3945,12 +3987,42 @@ pf_socket_lookup(int direction, struct pf_pdesc *pd) case AF_INET: inp = in_pcblookup_hash(pi, saddr->v4, sport, daddr->v4, dport, 0, NULL); +#if INET6 + if (inp == NULL) { + struct in6_addr s6, d6; + + memset(&s6, 0, sizeof (s6)); + s6.s6_addr16[5] = htons(0xffff); + memcpy(&s6.s6_addr32[3], &saddr->v4, + sizeof (saddr->v4)); + + memset(&d6, 0, sizeof (d6)); + d6.s6_addr16[5] = htons(0xffff); + memcpy(&d6.s6_addr32[3], &daddr->v4, + sizeof (daddr->v4)); + + inp = in6_pcblookup_hash(pi, &s6, sport, + &d6, dport, 0, NULL); + if (inp == NULL) { + inp = in_pcblookup_hash(pi, saddr->v4, sport, + daddr->v4, dport, INPLOOKUP_WILDCARD, NULL); + if (inp == NULL) { + inp = in6_pcblookup_hash(pi, &s6, sport, + &d6, dport, INPLOOKUP_WILDCARD, + NULL); + if (inp == NULL) + return (-1); + } + } + } +#else if (inp == NULL) { inp = in_pcblookup_hash(pi, saddr->v4, sport, daddr->v4, dport, INPLOOKUP_WILDCARD, NULL); if (inp == NULL) return (-1); } +#endif /* !INET6 */ break; #endif /* INET */ #if INET6 @@ -4983,8 +5055,8 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, struct udphdr *uh = pd->hdr.udp; size_t plen = m->m_pkthdr.len - off - sizeof (*uh); - if (uh->uh_sport == PF_IKE_PORT && - uh->uh_dport == 
PF_IKE_PORT && + if (ntohs(uh->uh_sport) == PF_IKE_PORT && + ntohs(uh->uh_dport) == PF_IKE_PORT && plen >= PF_IKE_PACKET_MINSIZE) { if (plen > PF_IKE_PACKET_MINSIZE) plen = PF_IKE_PACKET_MINSIZE; @@ -5154,11 +5226,13 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, if (sn != NULL) { s->src_node = sn; s->src_node->states++; + VERIFY(s->src_node->states != 0); } if (nsn != NULL) { PF_ACPY(&nsn->raddr, &pd->naddr, af); s->nat_src_node = nsn; s->nat_src_node->states++; + VERIFY(s->nat_src_node->states != 0); } if (pd->proto == IPPROTO_TCP) { if ((pd->flags & PFDESC_TCP_NORM) && @@ -5195,8 +5269,8 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, sk->af = af; #ifndef NO_APPLE_EXTENSIONS if (pd->proto == IPPROTO_UDP) { - if (pd->hdr.udp->uh_sport == PF_IKE_PORT && - pd->hdr.udp->uh_dport == PF_IKE_PORT) { + if (ntohs(pd->hdr.udp->uh_sport) == PF_IKE_PORT && + ntohs(pd->hdr.udp->uh_dport) == PF_IKE_PORT) { sk->proto_variant = PF_EXTFILTER_APD; } else { sk->proto_variant = nr ? nr->extfilter : @@ -5323,7 +5397,8 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, u_int16_t dport = (direction == PF_OUT) ? 
sk->ext.xport.port : sk->gwy.xport.port; - if (nr != NULL && dport == PF_PPTP_PORT) { + if (nr != NULL && + ntohs(dport) == PF_PPTP_PORT) { struct pf_app_state *as; as = pool_get(&pf_app_state_pl, @@ -5349,8 +5424,9 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, case IPPROTO_UDP: { struct udphdr *uh = pd->hdr.udp; - if (nr != NULL && uh->uh_sport == PF_IKE_PORT && - uh->uh_dport == PF_IKE_PORT) { + if (nr != NULL && + ntohs(uh->uh_sport) == PF_IKE_PORT && + ntohs(uh->uh_dport) == PF_IKE_PORT) { struct pf_app_state *as; as = pool_get(&pf_app_state_pl, @@ -5614,9 +5690,9 @@ pf_pptp_handler(struct pf_state *s, int direction, int off, as = &s->state_key->app_state->u.pptp; m_copydata(m, off, plen, &cm); - if (cm.hdr.magic != PF_PPTP_MAGIC_NUMBER) + if (ntohl(cm.hdr.magic) != PF_PPTP_MAGIC_NUMBER) return; - if (cm.hdr.type != htons(1)) + if (ntohs(cm.hdr.type) != 1) return; sk = s->state_key; @@ -5659,6 +5735,7 @@ pf_pptp_handler(struct pf_state *s, int direction, int off, gsk->gwy.xport.call_id = 0; gsk->ext.xport.call_id = 0; + STATE_INC_COUNTERS(gs); as->grev1_state = gs; } else { gsk = gs->state_key; @@ -5816,8 +5893,12 @@ pf_pptp_handler(struct pf_state *s, int direction, int off, } m = pf_lazy_makewritable(pd, m, off + plen); - if (!m) + if (!m) { + as->grev1_state = NULL; + STATE_DEC_COUNTERS(gs); + pool_put(&pf_state_pl, gs); return; + } m_copyback(m, off, plen, &cm); } @@ -5835,8 +5916,14 @@ pf_pptp_handler(struct pf_state *s, int direction, int off, gs->creation = pf_time_second(); gs->expire = pf_time_second(); gs->timeout = PFTM_GREv1_FIRST_PACKET; - if (gs->src_node) ++gs->src_node->states; - if (gs->nat_src_node) ++gs->nat_src_node->states; + if (gs->src_node != NULL) { + ++gs->src_node->states; + VERIFY(gs->src_node->states != 0); + } + if (gs->nat_src_node != NULL) { + ++gs->nat_src_node->states; + VERIFY(gs->nat_src_node->states != 0); + } pf_set_rt_ifp(gs, &sk->lan.addr); if (pf_insert_state(BOUND_IFACE(s->rule.ptr, kif), 
gs)) { @@ -5851,7 +5938,8 @@ pf_pptp_handler(struct pf_state *s, int direction, int off, * succeed. Failures are expected to be rare enough * that fixing this is a low priority. */ - + as->grev1_state = NULL; + pd->lmw = -1; pf_src_tree_remove_state(gs); STATE_DEC_COUNTERS(gs); pool_put(&pf_state_pl, gs); @@ -6105,9 +6193,27 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, >> sws; dws = dst->wscale & PF_WSCALE_MASK; } else { +#ifndef NO_APPLE_MODIFICATION + /* + * + * + * Window scale negotiation has failed, + * therefore we must restore the window + * scale in the state record that we + * optimistically removed in + * pf_test_rule(). Care is required to + * prevent arithmetic overflow from + * zeroing the window when it's + * truncated down to 16-bits. --jhw + */ + u_int32_t _win = dst->max_win; + _win <<= dst->wscale & PF_WSCALE_MASK; + dst->max_win = MIN(0xffff, _win); +#else /* fixup other window */ dst->max_win <<= dst->wscale & PF_WSCALE_MASK; +#endif /* in case of a retrans SYN|ACK */ dst->wscale = 0; } @@ -6125,9 +6231,16 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, * the crappy stack check or if we picked up the connection * after establishment) */ +#ifndef NO_APPLE_MODIFICATIONS + if (src->seqhi == 1 || + SEQ_GEQ(end + MAX(1, (u_int32_t)dst->max_win << dws), + src->seqhi)) + src->seqhi = end + MAX(1, (u_int32_t)dst->max_win << dws); +#else if (src->seqhi == 1 || SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) src->seqhi = end + MAX(1, dst->max_win << dws); +#endif if (win > src->max_win) src->max_win = win; @@ -6201,7 +6314,11 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, #define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */ if (SEQ_GEQ(src->seqhi, end) && /* Last octet inside other's window space */ +#ifndef NO_APPLE_MODIFICATIONS + SEQ_GEQ(seq, src->seqlo - ((u_int32_t)dst->max_win << dws)) && +#else SEQ_GEQ(seq, 
src->seqlo - (dst->max_win << dws)) && +#endif /* Retrans: not more than one window back */ (ackskew >= -MAXACKWINDOW) && /* Acking not more than one reassembled fragment backwards */ @@ -6229,9 +6346,13 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, if (SEQ_GT(end, src->seqlo)) src->seqlo = end; /* slide the window of what the other end can send */ +#ifndef NO_APPLE_MODIFICATIONS + if (SEQ_GEQ(ack + ((u_int32_t)win << sws), dst->seqhi)) + dst->seqhi = ack + MAX(((u_int32_t)win << sws), 1); +#else if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) dst->seqhi = ack + MAX((win << sws), 1); - +#endif /* update states */ if (th->th_flags & TH_SYN) @@ -6331,8 +6452,13 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, if (SEQ_GT(end, src->seqlo)) src->seqlo = end; /* slide the window of what the other end can send */ +#ifndef NO_APPLE_MODIFICATIONS + if (SEQ_GEQ(ack + ((u_int32_t)win << sws), dst->seqhi)) + dst->seqhi = ack + MAX(((u_int32_t)win << sws), 1); +#else if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) dst->seqhi = ack + MAX((win << sws), 1); +#endif /* * Cannot set dst->seqhi here since this could be a shotgunned @@ -6374,7 +6500,12 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, "fwd" : "rev"); printf("pf: State failure on: %c %c %c %c | %c %c\n", SEQ_GEQ(src->seqhi, end) ? ' ' : '1', +#ifndef NO_APPLE_MODIFICATIONS + SEQ_GEQ(seq, + src->seqlo - ((u_int32_t)dst->max_win << dws)) ? +#else SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ? +#endif ' ': '2', (ackskew >= -MAXACKWINDOW) ? ' ' : '3', (ackskew <= (MAXACKWINDOW << sws)) ? 
' ' : '4', @@ -6447,7 +6578,7 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, static int pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif, - struct mbuf *m, int off, void *h, struct pf_pdesc *pd) + struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason) { #pragma unused(h) struct pf_state_peer *src, *dst; @@ -6487,7 +6618,8 @@ pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif, } #ifndef NO_APPLE_EXTENSIONS - if (uh->uh_sport == PF_IKE_PORT && uh->uh_dport == PF_IKE_PORT) { + if (ntohs(uh->uh_sport) == PF_IKE_PORT && + ntohs(uh->uh_dport) == PF_IKE_PORT) { struct pf_ike_hdr ike; size_t plen = m->m_pkthdr.len - off - sizeof (*uh); if (plen < PF_IKE_PACKET_MINSIZE) { @@ -6570,6 +6702,10 @@ pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif, (*state)->state_key->app_state->handler) { (*state)->state_key->app_state->handler(*state, direction, off + uh->uh_ulen, pd, kif); + if (pd->lmw < 0) { + REASON_SET(reason, PFRES_MEMORY); + return (PF_DROP); + } m = pd->mp; } #endif @@ -6968,7 +7104,12 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, } if (!SEQ_GEQ(src->seqhi, seq) || +#ifndef NO_APPLE_MODIFICATION + !SEQ_GEQ(seq, + src->seqlo - ((u_int32_t)dst->max_win << dws))) { +#else !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))) { +#endif if (pf_status.debug >= PF_DEBUG_MISC) { printf("pf: BAD ICMP %d:%d ", icmptype, pd->hdr.icmp->icmp_code); @@ -7081,8 +7222,8 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, #ifndef NO_APPLE_EXTENSIONS key.proto_variant = PF_EXTFILTER_APD; - if (uh.uh_sport == PF_IKE_PORT && - uh.uh_dport == PF_IKE_PORT) { + if (ntohs(uh.uh_sport) == PF_IKE_PORT && + ntohs(uh.uh_dport) == PF_IKE_PORT) { struct pf_ike_hdr ike; size_t plen = m->m_pkthdr.len - off2 - sizeof (uh); @@ -8330,8 +8471,6 @@ pf_check_proto_cksum(struct mbuf *m, int off, int len, 
u_int8_t p, h = mtod(m, struct ip *); \ } \ } while (0) -#else -#define PF_APPLE_UPDATE_PDESC_IPv4() #endif int @@ -8439,9 +8578,13 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, if ((th.th_flags & TH_ACK) && pd.p_len == 0) pqid = 1; action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); - if (action == PF_DROP) +#ifndef NO_APPLE_EXTENSIONS + if (pd.lmw < 0) goto done; PF_APPLE_UPDATE_PDESC_IPv4(); +#endif + if (action == PF_DROP) + goto done; action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, &reason); #ifndef NO_APPLE_EXTENSIONS @@ -8478,7 +8621,8 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, REASON_SET(&reason, PFRES_SHORT); goto done; } - action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); + action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd, + &reason); #ifndef NO_APPLE_EXTENSIONS if (pd.lmw < 0) goto done; @@ -8614,7 +8758,10 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, } done: +#ifndef NO_APPLE_EXTENSIONS + *m0 = pd.mp; PF_APPLE_UPDATE_PDESC_IPv4(); +#endif if (action == PF_PASS && h->ip_hl > 5 && !((s && s->allow_opts) || r->allow_opts)) { @@ -8732,8 +8879,15 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, } #ifndef NO_APPLE_EXTENSIONS + VERIFY(m == NULL || pd.mp == NULL || pd.mp == m); + if (*m0) { if (pd.lmw < 0) { + REASON_SET(&reason, PFRES_MEMORY); + action = PF_DROP; + } + + if (action == PF_DROP) { m_freem(*m0); *m0 = NULL; return (PF_DROP); @@ -8766,8 +8920,6 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, h = mtod(m, struct ip6_hdr *); \ } \ } while (0) -#else -#define PF_APPLE_UPDATE_PDESC_IPv6() #endif int @@ -8944,9 +9096,13 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, } pd.p_len = pd.tot_len - off - (th.th_off << 2); action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); - if (action == PF_DROP) +#ifndef NO_APPLE_EXTENSIONS + if (pd.lmw < 0) goto done; PF_APPLE_UPDATE_PDESC_IPv6(); +#endif + if (action == PF_DROP) + goto done; action = pf_test_state_tcp(&s, dir, 
kif, m, off, h, &pd, &reason); #ifndef NO_APPLE_EXTENSIONS @@ -8983,7 +9139,8 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, REASON_SET(&reason, PFRES_SHORT); goto done; } - action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); + action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd, + &reason); #ifndef NO_APPLE_EXTENSIONS if (pd.lmw < 0) goto done; @@ -9120,7 +9277,10 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, } done: +#ifndef NO_APPLE_EXTENSIONS + *m0 = pd.mp; PF_APPLE_UPDATE_PDESC_IPv6(); +#endif if (n != m) { m_freem(n); @@ -9246,8 +9406,15 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd); #else #ifndef NO_APPLE_EXTENSIONS + VERIFY(m == NULL || pd.mp == NULL || pd.mp == m); + if (*m0) { if (pd.lmw < 0) { + REASON_SET(&reason, PFRES_MEMORY); + action = PF_DROP; + } + + if (action == PF_DROP) { m_freem(*m0); *m0 = NULL; return (PF_DROP); @@ -9411,6 +9578,15 @@ pf_time_second(void) { struct timeval t; + microuptime(&t); + return (t.tv_sec); +} + +uint64_t +pf_calendar_time_second(void) +{ + struct timeval t; + microtime(&t); return (t.tv_sec); } diff --git a/bsd/net/pf_ioctl.c b/bsd/net/pf_ioctl.c index 8145fed94..5b8461e37 100644 --- a/bsd/net/pf_ioctl.c +++ b/bsd/net/pf_ioctl.c @@ -1329,7 +1329,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENOMEM; } else { pf_status.running = 1; - pf_status.since = pf_time_second(); + pf_status.since = pf_calendar_time_second(); if (pf_status.stateid == 0) { pf_status.stateid = pf_time_second(); pf_status.stateid = pf_status.stateid << 32; @@ -1348,7 +1348,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) mbuf_growth_normal(); pf_detach_hooks(); pf_status.running = 0; - pf_status.since = pf_time_second(); + pf_status.since = pf_calendar_time_second(); wakeup(pf_purge_thread_fn); DPFPRINTF(PF_DEBUG_MISC, ("pf: stopped\n")); } @@ -1922,6 +1922,7 @@ pfioctl(dev_t dev, u_long cmd, 
caddr_t addr, int flags, struct proc *p) break; } pf_default_rule.states++; + VERIFY(pf_default_rule.states != 0); break; } @@ -2007,7 +2008,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) bzero(pf_status.counters, sizeof (pf_status.counters)); bzero(pf_status.fcounters, sizeof (pf_status.fcounters)); bzero(pf_status.scounters, sizeof (pf_status.scounters)); - pf_status.since = pf_time_second(); + pf_status.since = pf_calendar_time_second(); if (*pf_status.ifname) pfi_update_status(pf_status.ifname, NULL); break; diff --git a/bsd/net/pfvar.h b/bsd/net/pfvar.h index 60deece57..b8bdb0034 100644 --- a/bsd/net/pfvar.h +++ b/bsd/net/pfvar.h @@ -118,6 +118,7 @@ __private_extern__ void pool_sethardlimit(struct pool *, int, __private_extern__ void *pool_get(struct pool *, int); __private_extern__ void pool_put(struct pool *, void *); __private_extern__ u_int64_t pf_time_second(void); +__private_extern__ u_int64_t pf_calendar_time_second(void); #endif /* KERNEL */ union sockaddr_union { diff --git a/bsd/net/route.c b/bsd/net/route.c index 937341b00..df3b53ba3 100644 --- a/bsd/net/route.c +++ b/bsd/net/route.c @@ -1208,6 +1208,8 @@ rtioctl(unsigned long req, caddr_t data, struct proc *p) #if INET && MROUTING return mrt_ioctl(req, data); #else +#pragma unused(req) +#pragma unused(data) return ENXIO; #endif } diff --git a/bsd/netinet/in_arp.c b/bsd/netinet/in_arp.c index 6897e77ac..c553930f1 100644 --- a/bsd/netinet/in_arp.c +++ b/bsd/netinet/in_arp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2008 Apple Inc. All rights reserved. + * Copyright (c) 2004-2009 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -74,6 +74,7 @@ #include #include #include +#include #include #include #include @@ -937,34 +938,64 @@ arp_ip_handle_input( struct llinfo_arp *llinfo; errno_t error; int created_announcement = 0; - + int bridged = 0, is_bridge = 0; + /* Do not respond to requests for 0.0.0.0 */ if (target_ip->sin_addr.s_addr == 0 && arpop == ARPOP_REQUEST) goto done; + + if (ifp->if_bridge) + bridged = 1; + if (ifp->if_type == IFT_BRIDGE) + is_bridge = 1; /* * Determine if this ARP is for us + * For a bridge, we want to check the address irrespective + * of the receive interface. */ lck_rw_lock_shared(in_ifaddr_rwlock); TAILQ_FOREACH(ia, INADDR_HASH(target_ip->sin_addr.s_addr), ia_hash) { - /* do_bridge should be tested here for bridging */ - if (ia->ia_ifp == ifp && + if (((bridged && ia->ia_ifp->if_bridge != NULL) || + (ia->ia_ifp == ifp)) && ia->ia_addr.sin_addr.s_addr == target_ip->sin_addr.s_addr) { - best_ia = ia; - ifaref(&best_ia->ia_ifa); - lck_rw_done(in_ifaddr_rwlock); - goto match; + best_ia = ia; + ifaref(&best_ia->ia_ifa); + lck_rw_done(in_ifaddr_rwlock); + goto match; } } TAILQ_FOREACH(ia, INADDR_HASH(sender_ip->sin_addr.s_addr), ia_hash) { - /* do_bridge should be tested here for bridging */ - if (ia->ia_ifp == ifp && + if (((bridged && ia->ia_ifp->if_bridge != NULL) || + (ia->ia_ifp == ifp)) && ia->ia_addr.sin_addr.s_addr == sender_ip->sin_addr.s_addr) { - best_ia = ia; - ifaref(&best_ia->ia_ifa); - lck_rw_done(in_ifaddr_rwlock); - goto match; + best_ia = ia; + ifaref(&best_ia->ia_ifa); + lck_rw_done(in_ifaddr_rwlock); + goto match; + } + } + +#define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \ + (ia->ia_ifp->if_bridge == ifp->if_softc && \ + !bcmp(ifnet_lladdr(ia->ia_ifp), ifnet_lladdr(ifp), ifp->if_addrlen) && \ + addr == ia->ia_addr.sin_addr.s_addr) + /* + * Check the case when bridge shares its MAC address with + * some of its children, so packets are claimed by bridge + * itself (bridge_input() does it first), but 
they are really + * meant to be destined to the bridge member. + */ + if (is_bridge) { + TAILQ_FOREACH(ia, INADDR_HASH(target_ip->sin_addr.s_addr), ia_hash) { + if (BDG_MEMBER_MATCHES_ARP(target_ip->sin_addr.s_addr, ifp, ia)) { + ifp = ia->ia_ifp; + best_ia = ia; + ifaref(&best_ia->ia_ifa); + lck_rw_done(in_ifaddr_rwlock); + goto match; + } } } lck_rw_done(in_ifaddr_rwlock); @@ -980,12 +1011,16 @@ arp_ip_handle_input( continue; best_ia = (struct in_ifaddr *)ifa; ifaref(&best_ia->ia_ifa); - break; + ifnet_lock_done(ifp); + goto match; } ifnet_lock_done(ifp); - /* If we don't have an IP address on this interface, ignore the packet */ - if (best_ia == NULL) + /* + * If we're not a bridge member, or if we are but there's no + * IPv4 address to use for the interface, drop the packet. + */ + if (!bridged || best_ia == NULL) goto done; match: @@ -995,7 +1030,7 @@ arp_ip_handle_input( } /* Check for a conflict */ - if (sender_ip->sin_addr.s_addr == best_ia->ia_addr.sin_addr.s_addr) { + if (!bridged && sender_ip->sin_addr.s_addr == best_ia->ia_addr.sin_addr.s_addr) { struct kev_msg ev_msg; struct kev_in_collision *in_collision; u_char storage[sizeof(struct kev_in_collision) + MAX_HW_LEN]; @@ -1152,7 +1187,7 @@ arp_ip_handle_input( RT_LOCK_ASSERT_HELD(route); gateway = SDL(route->rt_gateway); - if (route->rt_ifp != ifp) { + if (!bridged && route->rt_ifp != ifp) { if (!IN_LINKLOCAL(ntohl(sender_ip->sin_addr.s_addr)) || (ifp->if_eflags & IFEF_ARPLL) == 0) { if (log_arp_warnings) log(LOG_ERR, "arp: %s is on %s%d but got reply from %s on %s%d\n", @@ -1286,6 +1321,19 @@ arp_ip_handle_input( if (error == 0) { RT_LOCK_ASSERT_HELD(route); + /* + * Return proxied ARP replies only on the interface + * or bridge cluster where this network resides. + * Otherwise we may conflict with the host we are + * proxying for. 
+ */ + if (route->rt_ifp != ifp && + (route->rt_ifp->if_bridge != ifp->if_bridge || + ifp->if_bridge == NULL)) { + RT_REMREF_LOCKED(route); + RT_UNLOCK(route); + goto done; + } proxied = *SDL(route->rt_gateway); target_hw = &proxied; } else { diff --git a/bsd/netinet/ip_dummynet.c b/bsd/netinet/ip_dummynet.c index 090c692bc..54fceaef4 100644 --- a/bsd/netinet/ip_dummynet.c +++ b/bsd/netinet/ip_dummynet.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -100,11 +100,6 @@ #include #include -#if BRIDGE -#include /* for struct arpcom */ -#include -#endif - /* * We keep a private variable for the simulation time, but we could * probably use an existing one ("softticks" in sys/kern/kern_timer.c) @@ -1155,28 +1150,6 @@ dummynet_send(struct mbuf *m) proto_inject(PF_INET, m); break ; -#if BRIDGE - case DN_TO_BDG_FWD : - /* - * The bridge requires/assumes the Ethernet header is - * contiguous in the first mbuf header. Insure this is true. - */ - if (BDG_LOADED) { - if (m->m_len < ETHER_HDR_LEN && - (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { - printf("dummynet/bridge: pullup fail, dropping pkt\n"); - break; - } - m = bdg_forward_ptr(m, pkt->ifp); - } else { - /* somebody unloaded the bridge module. 
Drop pkt */ - /* XXX rate limit */ - printf("dummynet: dropping bridged packet trapped in pipe\n"); - } - if (m) - m_freem(m); - break; -#endif default: printf("dummynet: bad switch %d!\n", pkt->dn_dir); m_freem(m); diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index eaf005f60..a989e64e3 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -2633,7 +2633,6 @@ ip_setmoptions(sopt, imop) struct ip_moptions **imop; { int error = 0; - int i; struct in_addr addr; struct ip_mreq mreq; struct ifnet *ifp = NULL; @@ -2654,20 +2653,23 @@ ip_setmoptions(sopt, imop) switch (sopt->sopt_name) { /* store an index number for the vif you wanna use in the send */ #if MROUTING - case IP_MULTICAST_VIF: - if (legal_vif_num == 0) { - error = EOPNOTSUPP; - break; - } - error = sooptcopyin(sopt, &i, sizeof i, sizeof i); - if (error) - break; - if (!legal_vif_num(i) && (i != -1)) { - error = EINVAL; + case IP_MULTICAST_VIF: + { + int i; + if (legal_vif_num == 0) { + error = EOPNOTSUPP; + break; + } + error = sooptcopyin(sopt, &i, sizeof i, sizeof i); + if (error) + break; + if (!legal_vif_num(i) && (i != -1)) { + error = EINVAL; + break; + } + imo->imo_multicast_vif = i; break; } - imo->imo_multicast_vif = i; - break; #endif /* MROUTING */ case IP_MULTICAST_IF: diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c index e7fda107f..a293cc24a 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -1007,6 +1007,10 @@ tcp_input(m, off0) goto drop; #endif + /* Radar 7377561: Avoid processing packets while closing a listen socket */ + if (tp->t_state == TCPS_LISTEN && (so->so_options & SO_ACCEPTCONN) == 0) + goto drop; + if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { #if TCPDEBUG if (so->so_options & SO_DEBUG) { @@ -1296,7 +1300,6 @@ tcp_input(m, off0) KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0); } } - #if 1 lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); #endif diff --git 
a/bsd/netinet6/in6.c b/bsd/netinet6/in6.c index 5fc80e330..a9fd82b98 100644 --- a/bsd/netinet6/in6.c +++ b/bsd/netinet6/in6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Inc. All rights reserved. + * Copyright (c) 2003-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -528,14 +528,14 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, getmicrotime(&timenow); privileged = (proc_suser(p) == 0); - +#if MROUTING switch (cmd) { case SIOCGETSGCNT_IN6: case SIOCGETMIFCNT_IN6_32: case SIOCGETMIFCNT_IN6_64: return (mrt6_ioctl(cmd, data)); } - +#endif if (ifp == NULL) return (EOPNOTSUPP); @@ -724,20 +724,9 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, case SIOCPROTOATTACH_IN6_32: case SIOCPROTOATTACH_IN6_64: - switch (ifp->if_type) { -#if IFT_BRIDGE /*OpenBSD 2.8*/ - /* some of the interfaces are inherently not IPv6 capable */ - case IFT_BRIDGE: - return; - /* NOTREACHED */ -#endif - default: - if ((error = proto_plumb(PF_INET6, ifp))) - printf("SIOCPROTOATTACH_IN6: %s " - "error=%d\n", if_name(ifp), error); - break; - - } + if ((error = proto_plumb(PF_INET6, ifp))) + printf("SIOCPROTOATTACH_IN6: %s " + "error=%d\n", if_name(ifp), error); return (error); /* NOTREACHED */ diff --git a/bsd/netinet6/in6_ifattach.c b/bsd/netinet6/in6_ifattach.c index dff06569f..5995b212d 100644 --- a/bsd/netinet6/in6_ifattach.c +++ b/bsd/netinet6/in6_ifattach.c @@ -307,6 +307,7 @@ get_hw_ifid( #if IFT_IEEE80211 case IFT_IEEE80211: #endif + case IFT_BRIDGE: /* IEEE802/EUI64 cases - what others? 
*/ /* IEEE1394 uses 16byte length address starting with EUI64 */ if (addrlen > 8) diff --git a/bsd/netinet6/in6_proto.c b/bsd/netinet6/in6_proto.c index a197c6c6b..d2621dd30 100644 --- a/bsd/netinet6/in6_proto.c +++ b/bsd/netinet6/in6_proto.c @@ -312,6 +312,7 @@ struct ip6protosw inet6sw[] = { 0, rip_unlock, 0, { 0, 0 }, NULL, { 0 } }, +#if MROUTING { SOCK_RAW, &inet6domain, IPPROTO_PIM, PR_ATOMIC|PR_ADDR|PR_LASTHDR, pim6_input, rip6_pr_output, 0, rip6_ctloutput, 0, @@ -321,6 +322,17 @@ struct ip6protosw inet6sw[] = { 0, rip_unlock, 0, { 0, 0 }, NULL, { 0 } }, +#else +{ SOCK_RAW, &inet6domain, IPPROTO_PIM, PR_ATOMIC|PR_ADDR|PR_LASTHDR, + 0, 0, 0, rip6_ctloutput, + 0, + 0, 0, 0, 0, + 0, + &rip6_usrreqs, + 0, rip_unlock, 0, + { 0, 0 }, NULL, { 0 } +}, +#endif /* raw wildcard */ { SOCK_RAW, &inet6domain, 0, PR_ATOMIC|PR_ADDR|PR_LASTHDR, rip6_input, rip6_pr_output, 0, rip6_ctloutput, @@ -548,8 +560,10 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, auto_linklocal, CTLFLAG_RW, &ip6_auto_linklocal, 0, ""); SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, CTLFLAG_RD, &rip6stat, rip6stat, ""); +#if MROUTING SYSCTL_STRUCT(_net_inet6_ip6, OID_AUTO, mrt6stat, CTLFLAG_RD, &mrt6stat, mrt6stat, ""); +#endif SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NEIGHBORGCTHRESH, neighborgcthresh, CTLFLAG_RW, &ip6_neighborgcthresh, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXIFPREFIXES, diff --git a/bsd/netinet6/ip6_input.c b/bsd/netinet6/ip6_input.c index 1c19434ab..cdf3776b2 100644 --- a/bsd/netinet6/ip6_input.c +++ b/bsd/netinet6/ip6_input.c @@ -637,7 +637,11 @@ ip6_input(m) ifnet_lock_done(ifp); if (in6m) ours = 1; +#if MROUTING else if (!ip6_mrouter) { +#else + else { +#endif ip6stat.ip6s_notmember++; ip6stat.ip6s_cantforward++; in6_ifstat_inc(ifp, ifs6_in_discard); @@ -902,12 +906,14 @@ ip6_input(m) * ip6_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. 
*/ +#if MROUTING if (ip6_mrouter && ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) { ip6stat.ip6s_cantforward++; m_freem(m); lck_mtx_unlock(ip6_mutex); return; } +#endif if (!ours) { m_freem(m); lck_mtx_unlock(ip6_mutex); diff --git a/bsd/netinet6/ip6_mroute.c b/bsd/netinet6/ip6_mroute.c index 3f0735c63..da8c4fc96 100644 --- a/bsd/netinet6/ip6_mroute.c +++ b/bsd/netinet6/ip6_mroute.c @@ -135,6 +135,9 @@ extern lck_mtx_t *ip6_mutex; struct socket *ip6_mrouter = NULL; int ip6_mrouter_ver = 0; int ip6_mrtproto = IPPROTO_PIM; /* for netstat only */ + +#if MROUTING + struct mrt6stat mrt6stat; #define NO_RTE_FOUND 0x1 @@ -1905,3 +1908,4 @@ pim6_input(mp, offp) rip6_input(&m, offp); return(IPPROTO_DONE); } +#endif diff --git a/bsd/netinet6/ip6_mroute.h b/bsd/netinet6/ip6_mroute.h index dd50d46bd..5eef448db 100644 --- a/bsd/netinet6/ip6_mroute.h +++ b/bsd/netinet6/ip6_mroute.h @@ -313,6 +313,7 @@ struct rtdetq { /* XXX: rtdetq is also defined in ip_mroute.h */ }; #endif /* _NETINET_IP_MROUTE_H_ */ +#if MROUTING #ifdef KERNEL_PRIVATE extern struct mrt6stat mrt6stat; @@ -322,5 +323,6 @@ extern int ip6_mrouter_done(void); extern int mrt6_ioctl(u_long, caddr_t); #endif /* KERNEL_PRIVATE */ #endif /* PRIVATE */ +#endif #endif /* !_NETINET6_IP6_MROUTE_H_ */ diff --git a/bsd/netinet6/ip6_output.c b/bsd/netinet6/ip6_output.c index e426933c6..39c0d4602 100644 --- a/bsd/netinet6/ip6_output.c +++ b/bsd/netinet6/ip6_output.c @@ -867,12 +867,14 @@ skip_ipsec2:; * above, will be forwarded by the ip6_input() routine, * if necessary. 
*/ +#if MROUTING if (ip6_mrouter && (flags & IPV6_FORWARDING) == 0) { if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); goto done; } } +#endif } /* * Multicasts with a hoplimit of zero may be looped back, diff --git a/bsd/netinet6/ip6_var.h b/bsd/netinet6/ip6_var.h index a895cad31..c5b5bca17 100644 --- a/bsd/netinet6/ip6_var.h +++ b/bsd/netinet6/ip6_var.h @@ -300,8 +300,9 @@ extern int ip6_neighborgcthresh; /* Threshold # of NDP entries for GC */ extern int ip6_maxifprefixes; /* Max acceptable prefixes via RA per IF */ extern int ip6_maxifdefrouters; /* Max acceptable def routers via RA */ extern int ip6_maxdynroutes; /* Max # of routes created via redirect */ - +#ifdef MROUTING extern struct socket *ip6_mrouter; /* multicast routing daemon */ +#endif extern int ip6_sendredirects; /* send IP redirects when forwarding? */ extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */ extern int ip6_maxfrags; /* Maximum fragments in reassembly queue */ diff --git a/bsd/netinet6/ipsec.c b/bsd/netinet6/ipsec.c index b65d9a5ef..6a7da3d2b 100644 --- a/bsd/netinet6/ipsec.c +++ b/bsd/netinet6/ipsec.c @@ -3384,6 +3384,7 @@ ipsec6_output_tunnel( struct ip *ip; struct sockaddr_in* dst4; struct route *ro4 = NULL; + struct ip_out_args ipoa = { IFSCOPE_NONE }; /* * must be last isr because encapsulated IPv6 packet @@ -3418,14 +3419,7 @@ ipsec6_output_tunnel( dst4->sin_family = AF_INET; dst4->sin_len = sizeof(*dst4); dst4->sin_addr = ip->ip_dst; - rtalloc(ro4); } - if (ro4->ro_rt == NULL) { - OSAddAtomic(1, &ipstat.ips_noroute); - error = EHOSTUNREACH; - goto bad; - } - state->m = ipsec4_splithdr(state->m); if (!state->m) { error = ENOMEM; @@ -3474,8 +3468,10 @@ ipsec6_output_tunnel( } ip = mtod(state->m, struct ip *); ip->ip_len = ntohs(ip->ip_len); /* flip len field before calling ip_output */ - ip_output(state->m, NULL, ro4, 0, NULL, NULL); + error = ip_output(state->m, NULL, ro4, IP_OUTARGS, NULL, &ipoa); state->m = NULL; + if (error != 0) + goto bad; goto done; } 
else { ipseclog((LOG_ERR, "ipsec6_output_tunnel: " @@ -4132,6 +4128,7 @@ ipsec_send_natt_keepalive( struct udphdr *uh; struct ip *ip; int error; + struct ip_out_args ipoa = { IFSCOPE_NONE }; lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED); @@ -4172,7 +4169,7 @@ ipsec_send_natt_keepalive( uh->uh_sum = 0; *(u_int8_t*)((char*)m_mtod(m) + sizeof(struct ip) + sizeof(struct udphdr)) = 0xFF; - error = ip_output(m, NULL, &sav->sah->sa_route, IP_NOIPSEC, NULL, NULL); + error = ip_output(m, NULL, &sav->sah->sa_route, IP_OUTARGS | IP_NOIPSEC, NULL, &ipoa); if (error == 0) { sav->natt_last_activity = natt_now; return TRUE; diff --git a/bsd/netinet6/mld6.c b/bsd/netinet6/mld6.c index 36e09c8b8..7e9a882e8 100644 --- a/bsd/netinet6/mld6.c +++ b/bsd/netinet6/mld6.c @@ -499,7 +499,11 @@ mld6_sendpkt( * Request loopback of the report if we are acting as a multicast * router, so that the process-level routing daemon can hear it. */ +#if MROUTING im6o.im6o_multicast_loop = (ip6_mrouter != NULL); +#else + im6o.im6o_multicast_loop = 0; +#endif /* increment output statictics */ icmp6stat.icp6s_outhist[type]++; diff --git a/bsd/netinet6/nd6.c b/bsd/netinet6/nd6.c index 5f2c2abcd..a7b5cb3a3 100644 --- a/bsd/netinet6/nd6.c +++ b/bsd/netinet6/nd6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1309,7 +1309,7 @@ nd6_free( dr = defrouter_lookup(&((struct sockaddr_in6 *)rt_key(rt))-> sin6_addr, rt->rt_ifp); - if (ln->ln_router || dr) { + if ((ln && ln->ln_router) || dr) { /* * rt6_flush must be called whether or not the neighbor * is in the Default Router List. @@ -2906,6 +2906,7 @@ nd6_need_cache( #if IFT_IEEE80211 case IFT_IEEE80211: #endif + case IFT_BRIDGE: case IFT_GIF: /* XXX need more cases? 
*/ return(1); default: @@ -2933,6 +2934,7 @@ nd6_storelladdr( #if IFT_IEEE80211 case IFT_IEEE80211: #endif + case IFT_BRIDGE: ETHER_MAP_IPV6_MULTICAST(&SIN6(dst)->sin6_addr, desten); return(1); diff --git a/bsd/netinet6/raw_ip6.c b/bsd/netinet6/raw_ip6.c index 762258e45..d4a32b927 100644 --- a/bsd/netinet6/raw_ip6.c +++ b/bsd/netinet6/raw_ip6.c @@ -566,8 +566,10 @@ rip6_ctloutput( case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: +#if MROUTING error = ip6_mrouter_get(so, sopt); break; +#endif default: error = ip6_ctloutput(so, sopt); break; @@ -597,8 +599,10 @@ rip6_ctloutput( case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: +#if MROUTING error = ip6_mrouter_set(so, sopt); break; +#endif default: error = ip6_ctloutput(so, sopt); break; @@ -649,8 +653,10 @@ rip6_detach(struct socket *so) if (inp == 0) panic("rip6_detach"); /* xxx: RSVP */ +#if MROUTING if (so == ip6_mrouter) ip6_mrouter_done(); +#endif if (inp->in6p_icmp6filt) { FREE(inp->in6p_icmp6filt, M_PCB); inp->in6p_icmp6filt = NULL; diff --git a/bsd/nfs/nfs_socket.c b/bsd/nfs/nfs_socket.c index e630eb95c..ee1a61679 100644 --- a/bsd/nfs/nfs_socket.c +++ b/bsd/nfs/nfs_socket.c @@ -3952,10 +3952,10 @@ nfsrv_rcv_locked(socket_t so, struct nfsrv_sock *slp, int waitflag) if (slp->ns_sotype == SOCK_STREAM) { /* * If there are already records on the queue, defer soreceive() - * to an nfsd so that there is feedback to the TCP layer that + * to an(other) nfsd so that there is feedback to the TCP layer that * the nfs servers are heavily loaded. 
*/ - if (slp->ns_rec && waitflag == MBUF_DONTWAIT) { + if (slp->ns_rec) { ns_flag = SLP_NEEDQ; goto dorecs; } diff --git a/bsd/sys/buf_internal.h b/bsd/sys/buf_internal.h index a11222c07..6d3801a50 100644 --- a/bsd/sys/buf_internal.h +++ b/bsd/sys/buf_internal.h @@ -147,6 +147,7 @@ struct buf { * Parameters for buffer cache garbage collection */ #define BUF_STALE_THRESHHOLD 30 /* Collect if untouched in the last 30 seconds */ +#define BUF_MAX_GC_COUNT 1000 /* Generally 6-8 MB */ /* * mask used by buf_flags... these are the readable external flags diff --git a/bsd/sys/kern_memorystatus.h b/bsd/sys/kern_memorystatus.h index 3abe336cc..92f687f3e 100644 --- a/bsd/sys/kern_memorystatus.h +++ b/bsd/sys/kern_memorystatus.h @@ -66,6 +66,10 @@ enum { typedef struct jetsam_priority_entry { pid_t pid; uint32_t flags; + int32_t hiwat_pages; + int32_t hiwat_reserved1; + int32_t hiwat_reserved2; + int32_t hiwat_reserved3; } jetsam_priority_entry_t; /* diff --git a/bsd/sys/mount.h b/bsd/sys/mount.h index d5a3d3272..57740c51f 100644 --- a/bsd/sys/mount.h +++ b/bsd/sys/mount.h @@ -292,6 +292,9 @@ struct vfs_attr { * NFS export related mount flags. */ #define MNT_EXPORTED 0x00000100 /* file system is exported */ +#ifdef PRIVATE +#define MNT_IMGSRC 0x00000200 +#endif /* PRIVATE */ /* * MAC labeled / "quarantined" flag diff --git a/bsd/sys/mount_internal.h b/bsd/sys/mount_internal.h index 6cca245d5..cb71406df 100644 --- a/bsd/sys/mount_internal.h +++ b/bsd/sys/mount_internal.h @@ -227,6 +227,10 @@ extern struct mount * dead_mountp; * because the bits here were broken out from the high bits * of the mount flags.
*/ +#ifdef CONFIG_IMGSRC_ACCESS +#define MNTK_HAS_MOVED 0x00002000 +#define MNTK_BACKS_ROOT 0x00004000 +#endif /* CONFIG_IMGSRC_ACCESS */ #define MNTK_AUTH_CACHE_TTL 0x00008000 /* rights cache has TTL - TTL of 0 disables cache */ #define MNTK_PATH_FROM_ID 0x00010000 /* mounted file system supports id-to-path lookups */ #define MNTK_UNMOUNT_PREFLIGHT 0x00020000 /* mounted file system wants preflight check during unmount */ diff --git a/bsd/sys/pthread_internal.h b/bsd/sys/pthread_internal.h index dc68c04c2..7d0cfae29 100644 --- a/bsd/sys/pthread_internal.h +++ b/bsd/sys/pthread_internal.h @@ -42,7 +42,6 @@ struct threadlist { TAILQ_ENTRY(threadlist) th_entry; thread_t th_thread; int th_flags; - uint32_t th_suspended; uint16_t th_affinity_tag; uint8_t th_priority; uint8_t th_policy; @@ -57,6 +56,7 @@ struct threadlist { #define TH_LIST_BLOCKED 0x04 #define TH_LIST_SUSPENDED 0x08 #define TH_LIST_BUSY 0x10 +#define TH_LIST_NEED_WAKEUP 0x20 struct workitem { TAILQ_ENTRY(workitem) wi_entry; diff --git a/bsd/sys/resource.h b/bsd/sys/resource.h index 85829a914..72c969c12 100644 --- a/bsd/sys/resource.h +++ b/bsd/sys/resource.h @@ -106,6 +106,7 @@ typedef __uint64_t rlim_t; #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) #define PRIO_DARWIN_THREAD 3 /* Second argument is always 0 (current thread) */ +#define PRIO_DARWIN_PROCESS 4 /* Second argument is a PID */ /* * Range limitations for the value of the third parameter to setpriority(). 
@@ -113,7 +114,8 @@ typedef __uint64_t rlim_t; #define PRIO_MIN -20 #define PRIO_MAX 20 -/* use PRIO_DARWIN_BG to set the current thread into "background" state +/* + * use PRIO_DARWIN_BG to set the current thread into "background" state * which lowers CPU, disk IO, and networking priorites until thread terminates * or "background" state is revoked */ diff --git a/bsd/sys/socketvar.h b/bsd/sys/socketvar.h index 2bd0c593e..830dc76ae 100644 --- a/bsd/sys/socketvar.h +++ b/bsd/sys/socketvar.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -120,7 +120,7 @@ struct socket { short so_options; /* from socket call, see socket.h */ short so_linger; /* time to linger while closing */ short so_state; /* internal state flags SS_*, below */ - caddr_t so_pcb; /* protocol control block */ + void *so_pcb; /* protocol control block */ struct protosw *so_proto; /* protocol handle */ /* * Variables for connection queueing. diff --git a/bsd/sys/sockio.h b/bsd/sys/sockio.h index 4a7700e0b..ab6ca6658 100644 --- a/bsd/sys/sockio.h +++ b/bsd/sys/sockio.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -160,6 +160,22 @@ #define SIOCGIFBOND _IOWR('i', 71, struct ifreq) /* get bond if config */ #define SIOCIFCREATE _IOWR('i', 120, struct ifreq) /* create clone if */ #define SIOCIFDESTROY _IOW('i', 121, struct ifreq) /* destroy clone if */ + +#define SIOCSDRVSPEC _IOW('i', 123, struct ifdrv) /* set driver-specific + parameters */ +#define SIOCGDRVSPEC _IOWR('i', 123, struct ifdrv) /* get driver-specific + parameters */ +#ifdef KERNEL_PRIVATE +#define SIOCSDRVSPEC32 _IOW('i', 123, struct ifdrv32) /* set driver-specific + parameters */ +#define SIOCGDRVSPEC32 _IOWR('i', 123, struct ifdrv32) /* get driver-specific + parameters */ +#define SIOCSDRVSPEC64 _IOW('i', 123, struct ifdrv64) /* set driver-specific + parameters */ +#define SIOCGDRVSPEC64 _IOWR('i', 123, struct ifdrv64) /* get driver-specific + parameters */ + +#endif /* KERNEL_PRIVATE */ #define SIOCSIFVLAN _IOW('i', 126, struct ifreq) /* set VLAN config */ #define SIOCGIFVLAN _IOWR('i', 127, struct ifreq) /* get VLAN config */ #define SIOCSETVLAN SIOCSIFVLAN diff --git a/bsd/sys/ubc_internal.h b/bsd/sys/ubc_internal.h index 7c73b04ee..775a8457b 100644 --- a/bsd/sys/ubc_internal.h +++ b/bsd/sys/ubc_internal.h @@ -58,8 +58,11 @@ extern struct zone *ubc_info_zone; +/* + * Maximum number of vfs clusters per vnode + */ +#define MAX_CLUSTERS CONFIG_MAX_CLUSTERS -#define MAX_CLUSTERS 8 /* maximum number of vfs clusters per vnode */ #define SPARSE_PUSH_LIMIT 4 /* limit on number of concurrent sparse pushes outside of the cl_lockw */ /* once we reach this limit, we'll hold the lock */ diff --git a/bsd/sys/vnode_internal.h b/bsd/sys/vnode_internal.h index cf8f7b455..dbff3a50d 100644 --- a/bsd/sys/vnode_internal.h +++ b/bsd/sys/vnode_internal.h @@ -250,6 +250,10 @@ struct vnode { */ extern struct vnode *rootvnode; /* root (i.e. 
"/") vnode */ +#ifdef CONFIG_IMGSRC_ACCESS +extern struct vnode *imgsrc_rootvnode; +#endif /* CONFIG_IMGSRC_ACCESS */ + /* * Mods for exensibility. diff --git a/bsd/vfs/vfs_bio.c b/bsd/vfs/vfs_bio.c index 3f4c4e593..6d3eba5eb 100644 --- a/bsd/vfs/vfs_bio.c +++ b/bsd/vfs/vfs_bio.c @@ -125,7 +125,7 @@ static void buf_reassign(buf_t bp, vnode_t newvp); static errno_t buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo); static int buf_iterprepare(vnode_t vp, struct buflists *, int flags); static void buf_itercomplete(vnode_t vp, struct buflists *, int flags); -static boolean_t buffer_cache_gc(void); +boolean_t buffer_cache_gc(void); __private_extern__ int bdwrite_internal(buf_t, int); @@ -3648,12 +3648,13 @@ brecover_data(buf_t bp) return(0); } -static boolean_t +boolean_t buffer_cache_gc(void) { buf_t bp; boolean_t did_large_zfree = FALSE; int now = buf_timestamp(); + uint32_t count = 0; lck_mtx_lock_spin(buf_mtxp); @@ -3661,7 +3662,7 @@ buffer_cache_gc(void) bp = TAILQ_FIRST(&bufqueues[BQ_META]); /* Only collect buffers unused in the last N seconds. Note: ordered by timestamp. 
*/ - while ((bp != NULL) && ((now - bp->b_timestamp) > BUF_STALE_THRESHHOLD)) { + while ((bp != NULL) && ((now - bp->b_timestamp) > BUF_STALE_THRESHHOLD) && (count < BUF_MAX_GC_COUNT)) { int result, size; boolean_t is_zalloc; @@ -3674,6 +3675,7 @@ buffer_cache_gc(void) did_large_zfree = TRUE; } bp = TAILQ_FIRST(&bufqueues[BQ_META]); + count++; } lck_mtx_unlock(buf_mtxp); diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index 5aec1498a..d436d781b 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -2718,7 +2718,7 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old * because IO_HEADZEROFILL and IO_TAILZEROFILL not set */ if ((start_offset + total_size) > max_io_size) - total_size -= start_offset; + total_size = max_io_size - start_offset; xfer_resid = total_size; retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1); @@ -5614,6 +5614,14 @@ is_file_clean(vnode_t vp, off_t filesize) #define DRT_HASH_SMALL_MODULUS 23 #define DRT_HASH_LARGE_MODULUS 401 +/* + * Physical memory required before the large hash modulus is permitted. + * + * On small memory systems, the large hash modulus can lead to physical + * memory starvation, so we avoid using it there. + */ +#define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */ + #define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */ #define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */ @@ -5756,8 +5764,12 @@ vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp) * see whether we should grow to the large one. */ if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) { - /* if the ring is nearly full */ - if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) { + /* + * If the ring is nearly full and we are allowed to + * use the large modulus, upgrade.
+ */ + if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) && + (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) { nsize = DRT_HASH_LARGE_MODULUS; } else { nsize = DRT_HASH_SMALL_MODULUS; diff --git a/bsd/vfs/vfs_conf.c b/bsd/vfs/vfs_conf.c index 467eb00b2..529129d9c 100644 --- a/bsd/vfs/vfs_conf.c +++ b/bsd/vfs/vfs_conf.c @@ -84,6 +84,11 @@ */ struct mount *rootfs; struct vnode *rootvnode; + +#ifdef CONFIG_IMGSRC_ACCESS +struct vnode *imgsrc_rootvnode; +#endif /* CONFIG_IMGSRC_ACCESS */ + int (*mountroot)(void) = NULL; /* diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index c44732bb8..d78894caf 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -6901,8 +6901,6 @@ errno_t rmdir_remove_orphaned_appleDouble(vnode_t vp , vfs_context_t ctx, int * } while (!eofflag); /* * If we've made it here all the files in the dir are ._ files. - * As we iterate through to delete them, we will verify that - * they are true AppleDouble files. * We can delete the files even though the node is suspended * because we are the owner of the file. */ @@ -6943,61 +6941,12 @@ errno_t rmdir_remove_orphaned_appleDouble(vnode_t vp , vfs_context_t ctx, int * (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.')) ) { - /* - * This is a ._ file, so verify it is actually an AppleDouble - * file by checking the header before we remove it. - */ - vnode_t xvp = NULL; - int did_namei = 0; - - NDINIT(&nd_temp, DELETE, USEDVP | LOCKPARENT, - UIO_SYSSPACE, CAST_USER_ADDR_T(dp->d_name), ctx); + NDINIT(&nd_temp, DELETE, USEDVP, UIO_SYSSPACE, CAST_USER_ADDR_T(dp->d_name), ctx); nd_temp.ni_dvp = vp; - error = namei(&nd_temp); - - if (error) { - if (error == ENOENT) { - error = 0; - } else { - error = ENOTEMPTY; - } - goto out1; - } - did_namei = 1; - - xvp = nd_temp.ni_vp; - - error = check_appledouble_header(xvp, ctx); - if (error) { - error = ENOTEMPTY; - goto out1; - } - - /* Remove the file.
*/ - error = VNOP_REMOVE(vp, xvp, &nd_temp.ni_cnd, 0, ctx); - if (error) { - if (error == ENOENT) { - error = 0; - } - goto out1; - } - -out1: - /* drop extra reference on vp from LOCKPARENT namei */ - vnode_put (vp); - - if (did_namei) { - nameidone(&nd_temp); - did_namei = 0; - } - if (xvp) { - vnode_put(xvp); - xvp = NULL; - } - if (error) { + error = unlink1(ctx, &nd_temp, 0); + if (error && error != ENOENT) { goto outsc; } - } cpos += dp->d_reclen; dp = (struct dirent*)cpos; diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c index 23653799f..24dfd95b3 100644 --- a/bsd/vfs/vfs_syscalls.c +++ b/bsd/vfs/vfs_syscalls.c @@ -153,6 +153,17 @@ static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp, static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp); static int fsync_common(proc_t p, struct fsync_args *uap, int flags); + +#ifdef CONFIG_IMGSRC_ACCESS +static int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname); +static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx); +static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx); +static void undo_place_on_covered_vp(mount_t mp, vnode_t vp); +static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags); +static void mount_end_update(mount_t mp); +static int relocate_imageboot_source(vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs); +#endif /* CONFIG_IMGSRC_ACCESS */ + int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t); __private_extern__ @@ -297,6 +308,15 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 if (error) goto out1; +#ifdef CONFIG_IMGSRC_ACCESS + if (uap->flags == MNT_IMGSRC) { + error = relocate_imageboot_source(vp, &nd.ni_cnd, fstypename, ctx, is_64bit, fsmountargs); + vnode_put(pvp); + 
vnode_put(vp); + return error; + } +#endif /* CONFIG_IMGSRC_ACCESS */ + if (uap->flags & MNT_UPDATE) { if ((vp->v_flag & VROOT) == 0) { error = EINVAL; @@ -323,6 +343,17 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 error = ENOTSUP; goto out1; } + +#ifdef CONFIG_IMGSRC_ACCESS + /* Can't downgrade the backer of the root FS */ + if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) && + (!vfs_isrdonly(mp)) && (uap->flags & MNT_RDONLY)) + { + error = ENOTSUP; + goto out1; + } +#endif /* CONFIG_IMGSRC_ACCESS */ + /* * Only root, or the user that did the original mount is * permitted to update it. @@ -867,6 +898,368 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 return(error); } +#ifdef CONFIG_IMGSRC_ACCESS +/* + * Flush in-core data, check for competing mount attempts, + * and set VMOUNT + */ +static int +prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname) +{ + struct vnode_attr va; + int error; + + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. 
+ */ + VATTR_INIT(&va); + VATTR_WANTED(&va, va_uid); + if ((error = vnode_getattr(vp, &va, ctx)) || + (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) && + (!vfs_context_issuser(ctx)))) { + error = EPERM; + goto out; + } + + if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) ) + goto out; + + if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) ) + goto out; + + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + + if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) { + error = EBUSY; + goto out; + } + +#if CONFIG_MACF + error = mac_mount_check_mount(ctx, vp, + cnp, fsname); + if (error != 0) + goto out; +#endif + + vnode_lock_spin(vp); + SET(vp->v_flag, VMOUNT); + vnode_unlock(vp); + +out: + return error; +} + +static int +authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx) +{ + struct nameidata nd; + vnode_t vp; + mode_t accessmode; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx); + if ( (error = namei(&nd)) ) + return error; + + strncpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN); + vp = nd.ni_vp; + nameidone(&nd); + + if (vp->v_type != VBLK) { + error = ENOTBLK; + goto out; + } + if (major(vp->v_rdev) >= nblkdev) { + error = ENXIO; + goto out; + } + /* + * If mount by non-root, then verify that user has necessary + * permissions on the device. 
+ */ + if (!vfs_context_issuser(ctx)) { + accessmode = KAUTH_VNODE_READ_DATA; + if ((mp->mnt_flag & MNT_RDONLY) == 0) + accessmode |= KAUTH_VNODE_WRITE_DATA; + if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) + goto out; + } + + *devvpp = vp; +out: + if (error) { + vnode_put(vp); + } + + return error; +} + +/* + * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode, + * and call checkdirs() + */ +static int +place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx) +{ + int error; + + mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */ + + vnode_lock_spin(vp); + CLR(vp->v_flag, VMOUNT); + vp->v_mountedhere = mp; + vnode_unlock(vp); + + /* + * taking the name_cache_lock exclusively will + * insure that everyone is out of the fast path who + * might be trying to use a now stale copy of + * vp->v_mountedhere->mnt_realrootvp + * bumping mount_generation causes the cached values + * to be invalidated + */ + name_cache_lock(); + mount_generation++; + name_cache_unlock(); + + error = vnode_ref(vp); + if (error != 0) { + goto out; + } + + error = checkdirs(vp, ctx); + if (error != 0) { + /* Unmount the filesystem as cdir/rdirs cannot be updated */ + vnode_rele(vp); + goto out; + } + +out: + if (error != 0) { + mp->mnt_vnodecovered = NULLVP; + } + return error; +} + +static void +undo_place_on_covered_vp(mount_t mp, vnode_t vp) +{ + vnode_rele(vp); + vnode_lock_spin(vp); + vp->v_mountedhere = (mount_t)NULL; + vnode_unlock(vp); + + mp->mnt_vnodecovered = NULLVP; +} + +static int +mount_begin_update(mount_t mp, vfs_context_t ctx, int flags) +{ + int error = 0; /* success unless a check below fails; nothing else assigns it when CONFIG_MACF is off */ + + /* unmount in progress return error */ + mount_lock_spin(mp); + if (mp->mnt_lflag & MNT_LUNMOUNT) { + mount_unlock(mp); + return EBUSY; + } + mount_unlock(mp); + lck_rw_lock_exclusive(&mp->mnt_rwlock); + + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only.
+ */ + if ((flags & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + error = ENOTSUP; + goto out; + } + + /* + * Only root, or the user that did the original mount is + * permitted to update it. + */ + if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) && + (!vfs_context_issuser(ctx))) { + error = EPERM; + goto out; + } +#if CONFIG_MACF + error = mac_mount_check_remount(ctx, mp); + if (error != 0) { + goto out; + } +#endif + +out: + if (error) { + lck_rw_done(&mp->mnt_rwlock); + } + + return error; +} + +static void +mount_end_update(mount_t mp) +{ + lck_rw_done(&mp->mnt_rwlock); +} + +static int +relocate_imageboot_source(vnode_t vp, struct componentname *cnp, + const char *fsname, vfs_context_t ctx, + boolean_t is64bit, user_addr_t fsmountargs) +{ + int error; + mount_t mp; + boolean_t placed = FALSE; + vnode_t devvp; + struct vfstable *vfsp; + user_addr_t devpath; + char *old_mntonname; + + /* If we didn't imageboot, nothing to move */ + if (imgsrc_rootvnode == NULLVP) { + return EINVAL; + } + + /* Only root can do this */ + if (!vfs_context_issuser(ctx)) { + return EPERM; + } + + error = vnode_get(imgsrc_rootvnode); + if (error != 0) { + return error; + } + + MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK); + + /* Can only move once */ + mp = vnode_mount(imgsrc_rootvnode); + if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) { + error = EBUSY; + goto out0; + } + + /* Get exclusive rwlock on mount, authorize update on mp */ + error = mount_begin_update(mp , ctx, 0); + if (error != 0) { + goto out0; + } + + /* + * It can only be moved once. Flag is set under the rwlock, + * so we're now safe to proceed. 
+ */ + error = ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) ? EBUSY : 0; /* lost the race: report EBUSY like the unlocked check above, not success */ + if (error != 0) + goto out1; + + /* Mark covered vnode as mount in progress, authorize placing mount on top */ + error = prepare_coveredvp(vp, ctx, cnp, fsname); + if (error != 0) { + goto out1; + } + + /* Sanity check the name caller has provided */ + vfsp = mp->mnt_vtable; + if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) { + error = EINVAL; + goto out2; + } + + /* Check the device vnode and update mount-from name, for local filesystems */ + if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) { + if (is64bit) { + if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) ) + goto out2; + fsmountargs += sizeof(devpath); + } else { + user32_addr_t tmp; + if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) ) + goto out2; + /* munge into LP64 addr */ + devpath = CAST_USER_ADDR_T(tmp); + fsmountargs += sizeof(tmp); + } + + if (devpath != USER_ADDR_NULL) { + error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx); + if (error) { + goto out2; + } + + vnode_put(devvp); + } + } + + /* + * Place mp on top of vnode, ref the vnode, call checkdirs(), + * and increment the name cache's mount generation + */ + error = place_mount_and_checkdirs(mp, vp, ctx); + if (error != 0) { + goto out2; + } + + placed = TRUE; + + strncpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN); + strncpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN); + + /* Forbid future moves */ + mount_lock(mp); + mp->mnt_kern_flag |= MNTK_HAS_MOVED; + mount_unlock(mp); + + /* Finally, add to mount list, completely ready to go */ + error = mount_list_add(mp); + if (error != 0) { + goto out3; + } + + mount_end_update(mp); + vnode_put(imgsrc_rootvnode); + FREE(old_mntonname, M_TEMP); + + return 0; +out3: + strncpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN); + + mount_lock(mp); + mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED); + mount_unlock(mp); + +out2: + /* + * Placing the mp on the vnode clears
VMOUNT, + * so cleanup is different after that point + */ + if (placed) { + /* Rele the vp, clear VMOUNT and v_mountedhere */ + undo_place_on_covered_vp(mp, vp); + } else { + vnode_lock_spin(vp); + CLR(vp->v_flag, VMOUNT); + vnode_unlock(vp); + } +out1: + mount_end_update(mp); + +out0: + vnode_put(imgsrc_rootvnode); + FREE(old_mntonname, M_TEMP); + return error; +} + +#endif /* CONFIG_IMGSRC_ACCESS */ + void enablequotas(struct mount *mp, vfs_context_t ctx) { @@ -1086,6 +1479,13 @@ safedounmount(struct mount *mp, int flags, vfs_context_t ctx) goto out; } +#ifdef CONFIG_IMGSRC_ACCESS + if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) { + error = EBUSY; + goto out; + } +#endif /* CONFIG_IMGSRC_ACCESS */ + return (dounmount(mp, flags, 1, ctx)); out: diff --git a/config/Makefile b/config/Makefile index c28573b0a..9a00f1027 100644 --- a/config/Makefile +++ b/config/Makefile @@ -221,5 +221,18 @@ do_build_all: build_symbol_sets do_build_install: install_symbol_sets +EXPORTS_FILE_LIST = $(addprefix $(SOURCE)/,$(foreach set,$(SYMBOL_COMPONENT_LIST), $(set).exports $(set).$(ARCH_CONFIG_LC).exports)) +EXPORTS_FILE_LIST_NOSYSTEM60 = $(addprefix $(SOURCE)/,$(foreach set, $(filter-out System6.0,$(SYMBOL_COMPONENT_LIST)), $(set).exports $(set).$(ARCH_CONFIG_LC).exports)) + +# Does not include "whole-kernel" clients +build_mach_kernel_exports: + $(_v)if [ $(SUPPORT_SYSTEM60_KEXT) -eq 1 ]; then \ + $(SOURCE)/generate_linker_exports.sh $(OBJPATH)/kernel-kpi.exp \ + $(EXPORTS_FILE_LIST) || exit 1; \ + else \ + $(SOURCE)/generate_linker_exports.sh $(OBJPATH)/kernel-kpi.exp \ + $(EXPORTS_FILE_LIST_NOSYSTEM60) || exit 1; \ + fi; + include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/config/MasterVersion b/config/MasterVersion index 44aab8c5c..6b41eba65 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -10.2.0 +10.3.0 # The first line of this file contains the master version number for the kernel. 
# All other instances of the kernel version in xnu are derived from this file. diff --git a/config/generate_linker_exports.sh b/config/generate_linker_exports.sh new file mode 100755 index 000000000..4af69e9b0 --- /dev/null +++ b/config/generate_linker_exports.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +set -e + +if [ $# -lt 2 ]; then + echo "Usage: $0 output.exp input1 [input2 ... ]" 1>&2 + exit 1 +fi + +OUTPUT="$1" +shift + +( grep -h -v ":" "$@"; grep -h ":" "$@" | awk -F: '{print $2}' ) | sort -u > "$OUTPUT" + +exit 0 diff --git a/iokit/Kernel/IODMACommand.cpp b/iokit/Kernel/IODMACommand.cpp index 603432900..66c178d5f 100644 --- a/iokit/Kernel/IODMACommand.cpp +++ b/iokit/Kernel/IODMACommand.cpp @@ -482,7 +482,8 @@ IODMACommand::walkAll(UInt8 op) if (state->fLocalMapper) { - state->fLocalMapperPageCount = atop_64(round_page(state->fPreparedLength)); + state->fLocalMapperPageCount = atop_64(round_page( + state->fPreparedLength + ((state->fPreparedOffset + fMDSummary.fPageAlign) & page_mask))); state->fLocalMapperPageAlloc = fMapper->iovmAllocDMACommand(this, state->fLocalMapperPageCount); state->fMapContig = true; } diff --git a/iokit/Kernel/IOServicePM.cpp b/iokit/Kernel/IOServicePM.cpp index fd150a4f6..c7b4319ed 100644 --- a/iokit/Kernel/IOServicePM.cpp +++ b/iokit/Kernel/IOServicePM.cpp @@ -2950,6 +2950,17 @@ IOReturn IOService::startPowerChange ( &powerState, changeFlags); #endif + // Invalidate the last recorded tickle power state when a power transition + // is about to occur, and not as a result of a tickle request. + + if ((getPMRequestType() != kIOPMRequestTypeActivityTickle) && + (fActivityTicklePowerState != -1)) + { + IOLockLock(fActivityLock); + fActivityTicklePowerState = -1; + IOLockUnlock(fActivityLock); + } + // Initialize the change note. 
fHeadNoteFlags = changeFlags; diff --git a/kgmacros b/kgmacros index 4f6fc2bbe..82cd0eeff 100644 --- a/kgmacros +++ b/kgmacros @@ -64,6 +64,7 @@ document kgm | showtaskvme Display info about the task's vm_map entries | showtaskipc Display info about the specified task's ipc space | showtaskrights Display info about the task's ipc space entries +| showtaskrightsbt Display info about the task's ipc space entries with back traces | showtaskbusyports Display all of the task's ports with unread messages | | showact Display info about a thread specified by activation @@ -213,7 +214,8 @@ document kgm | showallgdbcorestacks Corefile equivalent of "showallgdbstacks" | kdp-reenter Schedule reentry into the debugger and continue. | kdp-reboot Restart remote target -| kdp-version Get KDP version number +| kdp-version Get KDP version number +| kdp-connect "shorthand" connection macro | | zstack Print zalloc caller stack (zone leak debugging) | findoldest Find oldest zone leak debugging record @@ -1136,6 +1138,10 @@ define showipcint if $kgm_ie.ie_bits & 0x001f0000 set $kgm_name = (($kgm_iindex << 8)|($kgm_ie.ie_bits >> 24)) showipceint $kgm_iep $kgm_name + if $arg2 != 0 && $kgm_ie.ie_object != 0 && ($kgm_ie.ie_bits & 0x00070000) && ((ipc_port_t) $kgm_ie.ie_object)->ip_callstack[0] != 0 + printf " user bt: " + showportbt $kgm_ie.ie_object $kgm_is.is_task + end end set $kgm_iindex = $kgm_iindex + 1 set $kgm_iep = &($kgm_is.is_table[$kgm_iindex]) @@ -1151,7 +1157,7 @@ end define showipc set $kgm_isp = (ipc_space_t)$arg0 showipcheader - showipcint $kgm_isp 0 + showipcint $kgm_isp 0 0 end document showipc Syntax: (gdb) showipc @@ -1161,7 +1167,7 @@ end define showrights set $kgm_isp = (ipc_space_t)$arg0 showipcheader - showipcint $kgm_isp 1 + showipcint $kgm_isp 1 0 end document showrights Syntax: (gdb) showrights @@ -1174,7 +1180,7 @@ define showtaskipc showtaskheader showipcheader showtaskint $kgm_taskp - showipcint $kgm_taskp->itk_space 0 + showipcint $kgm_taskp->itk_space 0 0 
end document showtaskipc Syntax: (gdb) showtaskipc @@ -1187,13 +1193,25 @@ define showtaskrights showtaskheader showipcheader showtaskint $kgm_taskp - showipcint $kgm_taskp->itk_space 1 + showipcint $kgm_taskp->itk_space 1 0 end document showtaskrights Syntax: (gdb) showtaskrights | Routine to print info about the ipc rights for a task end +define showtaskrightsbt + set $kgm_taskp = (task_t)$arg0 + showtaskheader + showipcheader + showtaskint $kgm_taskp + showipcint $kgm_taskp->itk_space 1 1 +end +document showtaskrightsbt +Syntax: (gdb) showtaskrightsbt +| Routine to print info about the ipc rights for a task with backtraces +end + define showallipc set $kgm_head_taskp = &tasks set $kgm_cur_taskp = (struct task *)($kgm_head_taskp->next) @@ -1201,7 +1219,7 @@ define showallipc showtaskheader showipcheader showtaskint $kgm_cur_taskp - showipcint $kgm_cur_taskp->itk_space 0 + showipcint $kgm_cur_taskp->itk_space 0 0 set $kgm_cur_taskp = (struct task *)($kgm_cur_taskp->tasks.next) end end @@ -1218,7 +1236,7 @@ define showallrights showtaskheader showipcheader showtaskint $kgm_cur_taskp - showipcint $kgm_cur_taskp->itk_space 1 + showipcint $kgm_cur_taskp->itk_space 1 0 set $kgm_cur_taskp = (struct task *)($kgm_cur_taskp->tasks.next) end end @@ -1631,6 +1649,25 @@ define showportmember printf "0x%08x\n", $kgm_portp->ip_messages.data.port.msgcount end +define showportbt + set $kgm_iebt = ((ipc_port_t) $arg0)->ip_callstack + set $kgm_iepid = ((ipc_port_t) $arg0)->ip_spares[0] + set $kgm_procpid = ((proc_t) (((task_t) $arg1)->bsd_info))->p_pid + if $kgm_iebt[0] != 0 + showptr $kgm_iebt[0] + set $kgm_iebt_loop_ctr = 1 + while ($kgm_iebt_loop_ctr < 16 && $kgm_iebt[$kgm_iebt_loop_ctr]) + printf " " + showptr $kgm_iebt[$kgm_iebt_loop_ctr] + set $kgm_iebt_loop_ctr = $kgm_iebt_loop_ctr + 1 + end + if $kgm_iepid != $kgm_procpid + printf " (%d)", $kgm_iepid + end + printf "\n" + end +end + define showportint printf "0x%08x ", $arg0 set $kgm_portp = (struct ipc_port *)$arg0 @@ 
-2562,7 +2599,7 @@ define getdumpinfo dumpinfoint KDP_DUMPINFO_GETINFO set $kgm_dumpinfo = (kdp_dumpinfo_reply_t *) manual_pkt.data if $kgm_dumpinfo->type & KDP_DUMPINFO_REBOOT - printf "Sysem will reboot after kernel info gets dumped.\n" + printf "System will reboot after kernel info gets dumped.\n" else printf "Sysem will not reboot after kernel info gets dumped.\n" end @@ -10182,3 +10219,17 @@ Syntax: showallbusyports |Routine to print information about all receive rights on the system that |have enqueued messages. end + +define kdp-connect + if $argc > 0 + kdp-reattach $arg0 + else + printf "Attempting to attach to localhost...\n" + kdp-reattach localhost + end +end + +document kdp-connect +Syntax: (gdb) kdp-connect <address-of-remote-host> +| Attach to the machine with given hostname or IP address, or 'localhost' if blank +end diff --git a/libkern/c++/OSKext.cpp b/libkern/c++/OSKext.cpp index 9f4116491..acc3e3e98 100644 --- a/libkern/c++/OSKext.cpp +++ b/libkern/c++/OSKext.cpp @@ -3949,21 +3949,6 @@ OSKext::load( Boolean alreadyLoaded = false; OSKext * lastLoadedKext = NULL; - if (!sLoadEnabled) { - if (!isLoaded() || (!isStarted() && startOpt != kOSKextExcludeNone) || - (startMatchingOpt != kOSKextExcludeNone)) { - - OSKextLog(this, - kOSKextLogErrorLevel | - kOSKextLogLoadFlag, - "Kext loading is disabled " - "(attempt to load/start/start matching for kext %s).", - getIdentifierCString()); - } - result = kOSKextReturnDisabled; - goto finish; - } - if (isLoaded()) { alreadyLoaded = true; result = kOSReturnSuccess; @@ -3976,6 +3961,16 @@ OSKext::load( goto loaded; } + if (!sLoadEnabled) { + OSKextLog(this, + kOSKextLogErrorLevel | + kOSKextLogLoadFlag, + "Kext loading is disabled (attempt to load kext %s).", + getIdentifierCString()); + result = kOSKextReturnDisabled; + goto finish; + } + /* If we've pushed the next available load tag to the invalid value, * we can't load any more kexts.
*/ @@ -4136,9 +4131,7 @@ OSKext::load( OSKext::saveLoadedKextPanicList(); loaded: - /* This is a bit of a hack, because we shouldn't be handling - * personalities within the load function. - */ + if (declaresExecutable() && (startOpt == kOSKextExcludeNone)) { result = start(); if (result != kOSReturnSuccess) { @@ -4152,12 +4145,32 @@ OSKext::load( /* If not excluding matching, send the personalities to the kernel. * This never affects the result of the load operation. + * This is a bit of a hack, because we shouldn't be handling + * personalities within the load function. */ if (result == kOSReturnSuccess && startMatchingOpt == kOSKextExcludeNone) { - sendPersonalitiesToCatalog(true, personalityNames); + result = sendPersonalitiesToCatalog(true, personalityNames); } - finish: + + /* More hack! If the kext doesn't declare an executable, even if we + * "loaded" it, we have to remove any personalities naming it, or we'll + * never see the registry go quiet. Errors here do not count for the + * load operation itself. + * + * Note that in every other regard it's perfectly ok for a kext to + * not declare an executable and serve only as a package for personalities + * naming another kext, so we do have to allow such kexts to be "loaded" + * so that those other personalities get added & matched. + */ + if (!declaresExecutable()) { + OSKextLog(this, + kOSKextLogStepLevel | kOSKextLogLoadFlag, + "Kext %s has no executable; removing any personalities naming it.", + getIdentifierCString()); + removePersonalitiesFromCatalog(); + } + if (result != kOSReturnSuccess) { OSKextLog(this, kOSKextLogErrorLevel | @@ -4721,6 +4734,16 @@ OSKext::start(bool startDependenciesFlag) goto finish; } + if (!sLoadEnabled) { + OSKextLog(this, + kOSKextLogErrorLevel | + kOSKextLogLoadFlag, + "Kext loading is disabled (attempt to start kext %s).", + getIdentifierCString()); + result = kOSKextReturnDisabled; + goto finish; + } + result = validateKextMapping(/* start? 
*/ true); if (result != kOSReturnSuccess) { goto finish; @@ -7763,15 +7786,26 @@ OSKext::copyPersonalitiesArray(void) /********************************************************************* Might want to change this to a bool return? *********************************************************************/ -void +OSReturn OSKext::sendPersonalitiesToCatalog( bool startMatching, OSArray * personalityNames) { - OSArray * personalitiesToSend = NULL; // must release - OSDictionary * kextPersonalities = NULL; // do not release + OSReturn result = kOSReturnSuccess; + OSArray * personalitiesToSend = NULL; // must release + OSDictionary * kextPersonalities = NULL; // do not release int count, i; + if (!sLoadEnabled) { + OSKextLog(this, + kOSKextLogErrorLevel | + kOSKextLogLoadFlag, + "Kext loading is disabled (attempt to start matching for kext %s).", + getIdentifierCString()); + result = kOSKextReturnDisabled; + goto finish; + } + if (sSafeBoot && !isLoadableInSafeBoot()) { OSKextLog(this, kOSKextLogErrorLevel | @@ -7779,7 +7813,8 @@ OSKext::sendPersonalitiesToCatalog( "Kext %s is not loadable during safe boot; " "not sending personalities to the IOCatalogue.", getIdentifierCString()); - return; + result = kOSKextReturnNotLoadable; + goto finish; } if (!personalityNames || !personalityNames->getCount()) { @@ -7788,10 +7823,12 @@ OSKext::sendPersonalitiesToCatalog( kextPersonalities = OSDynamicCast(OSDictionary, getPropertyForHostArch(kIOKitPersonalitiesKey)); if (!kextPersonalities || !kextPersonalities->getCount()) { + // not an error goto finish; } personalitiesToSend = OSArray::withCapacity(0); if (!personalitiesToSend) { + result = kOSKextReturnNoMemory; goto finish; } count = personalityNames->getCount(); @@ -7824,10 +7861,12 @@ OSKext::sendPersonalitiesToCatalog( if (personalitiesToSend) { personalitiesToSend->release(); } - return; + return result; } /********************************************************************* +* xxx - We should allow removing the kext's 
declared personalities, +* xxx - even with other bundle identifiers. *********************************************************************/ void OSKext::removePersonalitiesFromCatalog(void) diff --git a/libkern/conf/MASTER b/libkern/conf/MASTER index 2ca7aafb2..b303c3603 100644 --- a/libkern/conf/MASTER +++ b/libkern/conf/MASTER @@ -67,3 +67,6 @@ options IPSEC # IP security # options CONFIG_KXLD # kxld/runtime linking of kexts # +# secure_kernel - secure kernel from user programs +options SECURE_KERNEL # + diff --git a/libkern/conf/files b/libkern/conf/files index c763d0ac4..15f992d67 100644 --- a/libkern/conf/files +++ b/libkern/conf/files @@ -68,6 +68,7 @@ libkern/stack_protector.c standard libkern/kxld/kxld.c optional config_kxld libkern/kxld/kxld_array.c optional config_kxld libkern/kxld/kxld_copyright.c optional config_kxld +libkern/kxld/kxld_demangle.c optional config_kxld libkern/kxld/kxld_dict.c optional config_kxld libkern/kxld/kxld_kext.c optional config_kxld libkern/kxld/kxld_reloc.c optional config_kxld diff --git a/libkern/kxld/Makefile b/libkern/kxld/Makefile index 84412e08f..9bc3566c6 100644 --- a/libkern/kxld/Makefile +++ b/libkern/kxld/Makefile @@ -59,7 +59,7 @@ CFLAGS=-std=c99 -Wall -Wextra -Werror -pedantic -Wformat=2 -Wcast-align \ -isysroot $(SDKROOT) LDFLAGS=$(ARCHS) -dynamiclib -install_name $(LIBKXLDNAME) \ -compatibility_version $(COMPATIBILITY_VERSION) \ - -current_version $(CURRENT_VERSION) -isysroot $(SDKROOT) + -current_version $(CURRENT_VERSION) -isysroot $(SDKROOT) -lstdc++ INCLUDES=-I$(HDRSRC) $(INCFLAGS_EXTERN) # Tools @@ -74,9 +74,9 @@ endif # Files HDR_NAMES=kxld.h kxld_types.h -OBJ_NAMES=kxld.o kxld_array.o kxld_copyright.o kxld_dict.o kxld_kext.o kxld_reloc.o \ - kxld_sect.o kxld_seg.o kxld_sym.o kxld_state.o kxld_symtab.o kxld_util.o \ - kxld_uuid.o kxld_vtable.o +OBJ_NAMES=kxld.o kxld_array.o kxld_copyright.o kxld_demangle.o kxld_dict.o \ + kxld_kext.o kxld_reloc.o kxld_sect.o kxld_seg.o kxld_sym.o kxld_state.o \ + 
kxld_symtab.o kxld_util.o kxld_uuid.o kxld_vtable.o HDRS=$(addprefix $(HDRSRC)/, $(HDR_NAMES)) OBJS=$(addprefix $(OBJROOT)/, $(OBJ_NAMES)) diff --git a/libkern/kxld/kxld_array.c b/libkern/kxld/kxld_array.c index b04a6045a..9720f3d08 100644 --- a/libkern/kxld/kxld_array.c +++ b/libkern/kxld/kxld_array.c @@ -86,8 +86,8 @@ kxld_array_init(KXLDArray *array, size_t itemsize, u_int nitems) */ if (array->maxitems < nitems) { STAILQ_FOREACH_SAFE(srcpool, &array->pools, entries, tmp) { - STAILQ_INSERT_TAIL(&srcpools, srcpool, entries); STAILQ_REMOVE(&array->pools, srcpool, kxld_array_pool, entries); + STAILQ_INSERT_TAIL(&srcpools, srcpool, entries); } srcpool_capacity = array->pool_capacity; bzero(array, sizeof(*array)); diff --git a/libkern/kxld/kxld_demangle.c b/libkern/kxld/kxld_demangle.c new file mode 100644 index 000000000..98ca4d55a --- /dev/null +++ b/libkern/kxld/kxld_demangle.c @@ -0,0 +1,46 @@ +#if !KERNEL + +#include <stdlib.h> + +/* This demangler is part of the C++ ABI. We don't include it directly from + * <cxxabi.h> so that we can avoid using C++ in the kernel linker. + */ +extern char * +__cxa_demangle(const char* __mangled_name, char* __output_buffer, + size_t* __length, int* __status); + +#endif /* !KERNEL */ + +#include "kxld_demangle.h" + +/******************************************************************************* +*******************************************************************************/ +const char * +kxld_demangle(const char *str, char **buffer __unused, size_t *length __unused) +{ +#if KERNEL + return str; +#else + const char *rval = NULL; + char *demangled = NULL; + int status; + + if (!str) goto finish; + + rval = str; + + if (!buffer || !length) goto finish; + + /* Symbol names in the symbol table have an extra '_' prepended to them, + * so we skip the first character to make the demangler happy.
+ */ + demangled = __cxa_demangle(str+1, *buffer, length, &status); + if (!demangled || status) goto finish; + + *buffer = demangled; + rval = demangled; +finish: + return rval; +#endif +} + diff --git a/libkern/kxld/kxld_demangle.h b/libkern/kxld/kxld_demangle.h new file mode 100644 index 000000000..1fee33193 --- /dev/null +++ b/libkern/kxld/kxld_demangle.h @@ -0,0 +1,24 @@ +#ifndef _KXLD_DEMANGLE_H_ +#define _KXLD_DEMANGLE_H_ + +#include <sys/types.h> + +/* @function kxld_demangle + + * @abstract Demangles c++ symbols. + * + * @param str The C-string to be demangled. + * @param buffer A pointer to a character buffer for storing the result. + * If NULL, a buffer will be malloc'd and stored here. + * If the buffer is not large enough, it will be realloc'd. + * + * @param length The length of the buffer. + * + * @result If the input string could be demangled, it returns the + * demangled string. Otherwise, returns the input string. + * + */ +const char * kxld_demangle(const char *str, char **buffer, size_t *length) + __attribute__((pure, nonnull, visibility("hidden"))); + +#endif /* !_KXLD_DEMANGLE_H_ */ diff --git a/libkern/kxld/kxld_kext.c b/libkern/kxld/kxld_kext.c index 7b5623003..a5520711e 100644 --- a/libkern/kxld/kxld_kext.c +++ b/libkern/kxld/kxld_kext.c @@ -51,6 +51,7 @@ #define DEBUG_ASSERT_COMPONENT_NAME_STRING "kxld" #include <AssertMacros.h> +#include "kxld_demangle.h" #include "kxld_dict.h" #include "kxld_kext.h" #include "kxld_reloc.h" @@ -1096,6 +1097,10 @@ create_vtables(KXLDKext *kext) char class_name[KXLD_MAX_NAME_LEN]; char vtable_name[KXLD_MAX_NAME_LEN]; char meta_vtable_name[KXLD_MAX_NAME_LEN]; + char *demangled_name1 = NULL; + char *demangled_name2 = NULL; + size_t demangled_length1 = 0; + size_t demangled_length2 = 0; u_int i = 0; u_int nvtables = 0; @@ -1161,7 +1166,10 @@ create_vtables(KXLDKext *kext) } else { kxld_log(kKxldLogPatching, kKxldLogErr, "Warning: " kKxldLogMissingVtable, - meta_vtable_name, class_name); + kxld_demangle(meta_vtable_name, &demangled_name1, +
&demangled_length1), + kxld_demangle(class_name, &demangled_name2, + &demangled_length2)); kxld_array_resize(&kext->vtables, --nvtables); } } @@ -1231,6 +1239,10 @@ create_vtables(KXLDKext *kext) rval = KERN_SUCCESS; finish: + + if (demangled_name1) kxld_free(demangled_name1, demangled_length1); + if (demangled_name2) kxld_free(demangled_name2, demangled_length2); + return rval; } @@ -1950,6 +1962,8 @@ resolve_symbols(KXLDKext *kext, KXLDDict *defined_symbols, boolean_t tests_for_weak = FALSE; boolean_t error = FALSE; boolean_t warning = FALSE; + char *demangled_name = NULL; + size_t demangled_length = 0; check(kext); check(defined_symbols); @@ -1981,8 +1995,8 @@ resolve_symbols(KXLDKext *kext, KXLDDict *defined_symbols, "The following symbols were defined more than once:"); } - kxld_log(kKxldLogLinking, kKxldLogErr, - "\t%s: %p - %p", sym->name, + kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s: %p - %p", + kxld_demangle(sym->name, &demangled_name, &demangled_length), (void *) (uintptr_t) sym->link_addr, (void *) (uintptr_t) addr); } @@ -2011,7 +2025,8 @@ resolve_symbols(KXLDKext *kext, KXLDDict *defined_symbols, "The following are common symbols:"); } } - kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s", sym->name); + kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s", + kxld_demangle(sym->name, &demangled_name, &demangled_length)); } else { @@ -2045,7 +2060,8 @@ resolve_symbols(KXLDKext *kext, KXLDDict *defined_symbols, if (obsolete_symbols && kxld_dict_find(obsolete_symbols, name)) { kxld_log(kKxldLogLinking, kKxldLogWarn, - "This kext uses obsolete symbol %s.", name); + "This kext uses obsolete symbol %s.", + kxld_demangle(name, &demangled_name, &demangled_length)); } } else if (kext->link_type == KXLD_LINK_PSEUDO_KEXT) { @@ -2058,7 +2074,8 @@ resolve_symbols(KXLDKext *kext, KXLDDict *defined_symbols, "This symbol set has the following unresolved symbols:"); warning = TRUE; } - kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s", sym->name); + kxld_log(kKxldLogLinking, 
kKxldLogErr, "\t%s", + kxld_demangle(sym->name, &demangled_name, &demangled_length)); kxld_sym_delete(sym); } else if (kxld_sym_is_weak(sym)) { @@ -2092,6 +2109,7 @@ resolve_symbols(KXLDKext *kext, KXLDDict *defined_symbols, rval = KERN_SUCCESS; finish: + if (demangled_name) kxld_free(demangled_name, demangled_length); return rval; } @@ -2148,6 +2166,10 @@ patch_vtables(KXLDKext *kext, KXLDDict *patched_vtables, char vtable_name[KXLD_MAX_NAME_LEN]; char super_vtable_name[KXLD_MAX_NAME_LEN]; char final_sym_name[KXLD_MAX_NAME_LEN]; + char *demangled_name1 = NULL; + char *demangled_name2 = NULL; + size_t demangled_length1 = 0;; + size_t demangled_length2 = 0; size_t len = 0; u_int nvtables = 0; u_int npatched = 0; @@ -2204,7 +2226,11 @@ patch_vtables(KXLDKext *kext, KXLDDict *patched_vtables, if (failure) { kxld_log(kKxldLogPatching, kKxldLogErr, - "\t%s (super vtable %s)", vtable_name, super_vtable_name); + "\t'%s' (super vtable '%s')", + kxld_demangle(vtable_name, &demangled_name1, + &demangled_length1), + kxld_demangle(super_vtable_name, &demangled_name2, + &demangled_length2)); continue; } @@ -2228,8 +2254,11 @@ patch_vtables(KXLDKext *kext, KXLDDict *patched_vtables, require_action(!final_sym, finish, rval=KERN_FAILURE; kxld_log(kKxldLogPatching, kKxldLogErr, - "Class %s is a subclass of final class %s.", - class_name, super_class_name)); + "Class '%s' is a subclass of final class '%s'.", + kxld_demangle(class_name, &demangled_name1, + &demangled_length1), + kxld_demangle(super_class_name, &demangled_name2, + &demangled_length2))); /* Patch the class's vtable */ rval = kxld_vtable_patch(vtable, super_vtable, kext->symtab, @@ -2297,6 +2326,9 @@ patch_vtables(KXLDKext *kext, KXLDDict *patched_vtables, rval = KERN_SUCCESS; finish: + if (demangled_name1) kxld_free(demangled_name1, demangled_length1); + if (demangled_name2) kxld_free(demangled_name2, demangled_length2); + return rval; } @@ -2309,6 +2341,8 @@ validate_symbols(KXLDKext *kext) KXLDSymtabIterator iter; 
KXLDSym *sym = NULL; u_int error = FALSE; + char *demangled_name = NULL; + size_t demangled_length = 0; /* Check for any unresolved symbols */ kxld_symtab_iterator_init(&iter, kext->symtab, kxld_sym_is_unresolved, FALSE); @@ -2318,13 +2352,15 @@ validate_symbols(KXLDKext *kext) kxld_log(kKxldLogLinking, kKxldLogErr, "The following symbols are unresolved for this kext:"); } - kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s", sym->name); + kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s", + kxld_demangle(sym->name, &demangled_name, &demangled_length)); } require_noerr_action(error, finish, rval=KERN_FAILURE); rval = KERN_SUCCESS; finish: + if (demangled_name) kxld_free(demangled_name, demangled_length); return rval; } diff --git a/libkern/kxld/kxld_util.h b/libkern/kxld/kxld_util.h index 9c387e670..3392b4a74 100644 --- a/libkern/kxld/kxld_util.h +++ b/libkern/kxld/kxld_util.h @@ -115,17 +115,17 @@ void kxld_log(KXLDLogSubsystem subsystem, KXLDLogLevel level, #define kKxldLogArchNotSupported "The target architecture (cputype 0x%x) is not supported by kxld." #define kKxldLogArchNotFound "The kext does not contain a fat slice for the target architecture." #define kKxldLogFiletypeNotSupported "The Mach-O filetype 0x%x is not supported on the target architecture." -#define kKxldLogTruncatedMachO "The Mach-O file has been truncated. Make sure the Mach-O header structures are correct." +#define kKxldLogTruncatedMachO "The Mach-O file has been truncated. Make sure the Mach-O header structures are correct." #define kKxldLogMalformedMachO "The Mach-O file is malformed: " -#define kKxldLogMalformedVTable "The vtable %s is malformed. Make sure your kext has been built against the correct headers." -#define kKxldLogMissingVtable "Cannot find the vtable %s for class %s. This vtable symbol is required for binary compatibility, and it may have been stripped." -#define kKxldLogParentOutOfDate "The super class vtable %s for vtable %s is out of date. 
Make sure your kext has been built against the correct headers." +#define kKxldLogMalformedVTable "The vtable '%s' is malformed. Make sure your kext has been built against the correct headers." +#define kKxldLogMissingVtable "Cannot find the vtable '%s' for class '%s'. This vtable symbol is required for binary compatibility, and it may have been stripped." +#define kKxldLogParentOutOfDate "The super class vtable '%s' for vtable '%s' is out of date. Make sure your kext has been built against the correct headers." #define kKxldLogNoKmodInfo "The kext is missing its kmod_info structure." #define kKxldLogInvalidSectReloc "Relocation entry %u from section %s,%s cannot be processed." #define kKxldLogInvalidExtReloc "External relocation entry %u cannot be processed." #define kKxldLogInvalidIntReloc "Internal relocation entry %u cannot be processed." -#define kKxldLogRelocationOverflow "A relocation entry has overflowed. The kext may be too far from one " \ - "of its dependencies. Check your kext's load address." +#define kKxldLogRelocationOverflow "A relocation entry has overflowed. The kext may be too far from one " \ + "of its dependencies. Check your kext's load address." 
/******************************************************************************* * Allocators diff --git a/libkern/kxld/kxld_vtable.c b/libkern/kxld/kxld_vtable.c index 78e647e6b..208c030d9 100644 --- a/libkern/kxld/kxld_vtable.c +++ b/libkern/kxld/kxld_vtable.c @@ -32,6 +32,7 @@ #define DEBUG_ASSERT_COMPONENT_NAME_STRING "kxld" #include +#include "kxld_demangle.h" #include "kxld_reloc.h" #include "kxld_sect.h" #include "kxld_state.h" @@ -73,6 +74,8 @@ kxld_vtable_init_from_kernel_macho(KXLDVTable *vtable, const KXLDSym *sym, const KXLDRelocator *relocator) { kern_return_t rval = KERN_FAILURE; + char *demangled_name = NULL; + size_t demangled_length = 0; check(vtable); check(sym); @@ -86,7 +89,8 @@ kxld_vtable_init_from_kernel_macho(KXLDVTable *vtable, const KXLDSym *sym, require_action(kxld_sect_get_num_relocs(sect) == 0, finish, rval=KERN_FAILURE; kxld_log(kKxldLogPatching, kKxldLogErr, - kKxldLogMalformedVTable, vtable->name)); + kKxldLogMalformedVTable, + kxld_demangle(vtable->name, &demangled_name, &demangled_length))); rval = init_by_entries(vtable, symtab, relocator); require_noerr(rval, finish); @@ -96,8 +100,8 @@ kxld_vtable_init_from_kernel_macho(KXLDVTable *vtable, const KXLDSym *sym, rval = KERN_SUCCESS; finish: - if (rval) kxld_vtable_deinit(vtable); + if (demangled_name) kxld_free(demangled_name, demangled_length); return rval; } @@ -110,6 +114,8 @@ kxld_vtable_init_from_object_macho(KXLDVTable *vtable, const KXLDSym *sym, const KXLDRelocator *relocator) { kern_return_t rval = KERN_FAILURE; + char *demangled_name = NULL; + size_t demangled_length = 0; check(vtable); check(sym); @@ -123,7 +129,8 @@ kxld_vtable_init_from_object_macho(KXLDVTable *vtable, const KXLDSym *sym, require_action(kxld_sect_get_num_relocs(sect) > 0, finish, rval=KERN_FAILURE; kxld_log(kKxldLogPatching, kKxldLogErr, - kKxldLogMalformedVTable, vtable->name)); + kKxldLogMalformedVTable, + kxld_demangle(vtable->name, &demangled_name, &demangled_length))); rval = 
init_by_relocs(vtable, sym, sect, symtab, relocator); require_noerr(rval, finish); @@ -131,8 +138,8 @@ kxld_vtable_init_from_object_macho(KXLDVTable *vtable, const KXLDSym *sym, rval = KERN_SUCCESS; finish: - if (rval) kxld_vtable_deinit(vtable); + if (demangled_name) kxld_free(demangled_name, demangled_length); return rval; } @@ -145,6 +152,8 @@ kxld_vtable_init_from_final_macho(KXLDVTable *vtable, const KXLDSym *sym, const KXLDRelocator *relocator, const KXLDArray *relocs) { kern_return_t rval = KERN_FAILURE; + char *demangled_name = NULL; + size_t demangled_length = 0; check(vtable); check(sym); @@ -158,7 +167,8 @@ kxld_vtable_init_from_final_macho(KXLDVTable *vtable, const KXLDSym *sym, require_action(kxld_sect_get_num_relocs(sect) == 0, finish, rval=KERN_FAILURE; kxld_log(kKxldLogPatching, kKxldLogErr, - kKxldLogMalformedVTable, vtable->name)); + kKxldLogMalformedVTable, + kxld_demangle(vtable->name, &demangled_name, &demangled_length))); rval = init_by_entries_and_relocs(vtable, sym, symtab, relocator, relocs); @@ -168,6 +178,7 @@ kxld_vtable_init_from_final_macho(KXLDVTable *vtable, const KXLDSym *sym, finish: if (rval) kxld_vtable_deinit(vtable); + if (demangled_name) kxld_free(demangled_name, demangled_length); return rval; } @@ -499,6 +510,8 @@ init_by_entries_and_relocs(KXLDVTable *vtable, const KXLDSym *sym, kxld_addr_t entry_offset = 0; u_int nentries = 0; u_int i = 0; + char *demangled_name1 = NULL; + size_t demangled_length1 = 0; check(vtable); check(sym); @@ -573,7 +586,9 @@ init_by_entries_and_relocs(KXLDVTable *vtable, const KXLDSym *sym, require_action(reloc, finish, rval=KERN_FAILURE; kxld_log(kKxldLogPatching, kKxldLogErr, - kKxldLogMalformedVTable, vtable->name)); + kKxldLogMalformedVTable, + kxld_demangle(vtable->name, &demangled_name1, + &demangled_length1))); tmpsym = kxld_reloc_get_symbol(relocator, reloc, /* data */ NULL, symtab); @@ -630,6 +645,12 @@ kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, KXLDSym *sym = 
NULL; u_int symindex = 0; u_int i = 0; + char *demangled_name1 = NULL; + char *demangled_name2 = NULL; + char *demangled_name3 = NULL; + size_t demangled_length1 = 0; + size_t demangled_length2 = 0; + size_t demangled_length3 = 0; check(vtable); check(super_vtable); @@ -637,8 +658,8 @@ kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, require_action(!vtable->is_patched, finish, rval=KERN_SUCCESS); require_action(vtable->entries.nitems >= super_vtable->entries.nitems, finish, rval=KERN_FAILURE; - kxld_log(kKxldLogPatching, kKxldLogErr, - kKxldLogMalformedVTable, vtable->name)); + kxld_log(kKxldLogPatching, kKxldLogErr, kKxldLogMalformedVTable, + kxld_demangle(vtable->name, &demangled_name1, &demangled_length1))); for (i = 0; i < super_vtable->entries.nitems; ++i) { child_entry = kxld_array_get_item(&vtable->entries, i); @@ -688,7 +709,11 @@ kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, require_action(!kxld_sym_name_is_padslot(parent_entry->patched.name), finish, rval=KERN_FAILURE; kxld_log(kKxldLogPatching, kKxldLogErr, - kKxldLogParentOutOfDate, super_vtable->name, vtable->name)); + kKxldLogParentOutOfDate, + kxld_demangle(super_vtable->name, &demangled_name1, + &demangled_length1), + kxld_demangle(vtable->name, &demangled_name2, + &demangled_length2))); #if KXLD_USER_OR_STRICT_PATCHING /* 5) If we are doing strict patching, we prevent kexts from declaring @@ -748,8 +773,11 @@ kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, require_noerr(rval, finish); kxld_log(kKxldLogPatching, kKxldLogDetail, - "In vtable %s, patching %s with %s.", - vtable->name, child_entry->unpatched.sym->name, sym->name); + "In vtable '%s', patching '%s' with '%s'.", + kxld_demangle(vtable->name, &demangled_name1, &demangled_length1), + kxld_demangle(child_entry->unpatched.sym->name, + &demangled_name2, &demangled_length2), + kxld_demangle(sym->name, &demangled_name3, &demangled_length3)); 
kxld_sym_patch(child_entry->unpatched.sym); child_entry->unpatched.sym = sym; @@ -779,6 +807,10 @@ kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, rval = KERN_SUCCESS; finish: + if (demangled_name1) kxld_free(demangled_name1, demangled_length1); + if (demangled_name2) kxld_free(demangled_name2, demangled_length2); + if (demangled_name3) kxld_free(demangled_name3, demangled_length3); + return rval; } diff --git a/libkern/libkern/OSAtomic.h b/libkern/libkern/OSAtomic.h index d8e157483..36d9127e2 100644 --- a/libkern/libkern/OSAtomic.h +++ b/libkern/libkern/OSAtomic.h @@ -49,7 +49,7 @@ extern "C" { * reading and updating of values. */ -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) /*! * @function OSCompareAndSwap64 diff --git a/libkern/libkern/c++/OSKext.h b/libkern/libkern/c++/OSKext.h index 14337f3ab..312d53993 100644 --- a/libkern/libkern/c++/OSKext.h +++ b/libkern/libkern/c++/OSKext.h @@ -399,7 +399,7 @@ class OSKext : public OSObject static void sendAllKextPersonalitiesToCatalog( bool startMatching = false); - virtual void sendPersonalitiesToCatalog( + virtual OSReturn sendPersonalitiesToCatalog( bool startMatching = false, OSArray * personalityNames = NULL); diff --git a/libkern/mkext.c b/libkern/mkext.c index e1fc062e1..86238fc35 100644 --- a/libkern/mkext.c +++ b/libkern/mkext.c @@ -25,12 +25,13 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include // For uintptr_t. 
#include #include + #define BASE 65521L /* largest prime smaller than 65536 */ -#define NMAX 5000 -// NMAX (was 5521) the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 +#define NMAX 5552 // the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 #define DO1(buf,i) {s1 += buf[i]; s2 += s1;} #define DO2(buf,i) DO1(buf,i); DO1(buf,i+1); @@ -45,6 +46,23 @@ mkext_adler32(uint8_t *buf, int32_t len) unsigned long s2 = 0; // (adler >> 16) & 0xffff; int k; +#if defined _ARM_ARCH_6 + + /* align buf to 16-byte boundary */ + while ((((uintptr_t)buf)&15)&&(len>0)) { /* not on a 16-byte boundary */ + len--; + s1 += *buf++; + s2 += s1; + if (s1 >= BASE) s1 -= BASE; + } + s2 %= BASE; + + if (len>=16) { + return adler32_vec(s1, s2, buf, len); + } + +#endif + while (len > 0) { k = len < NMAX ? len : NMAX; len -= k; diff --git a/libkern/zlib/adler32.c b/libkern/zlib/adler32.c index c94fde187..bf0d9723a 100644 --- a/libkern/zlib/adler32.c +++ b/libkern/zlib/adler32.c @@ -32,6 +32,9 @@ /* @(#) $Id$ */ +#include // For uintptr_t. 
+ + #define ZLIB_INTERNAL #if KERNEL #include @@ -39,6 +42,10 @@ #include "zlib.h" #endif /* KERNEL */ +#if defined _ARM_ARCH_6 + extern uLong adler32_vec(uLong adler, uLong sum2, const Bytef *buf, uInt len); +#endif + #define BASE 65521UL /* largest prime smaller than 65536 */ #define NMAX 5552 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ @@ -91,7 +98,9 @@ uLong ZEXPORT adler32(adler, buf, len) uInt len; { unsigned long sum2; +#if !defined _ARM_ARCH_6 unsigned n; +#endif /* split Adler-32 into component sums */ sum2 = (adler >> 16) & 0xffff; @@ -124,6 +133,20 @@ uLong ZEXPORT adler32(adler, buf, len) return adler | (sum2 << 16); } +#if defined _ARM_ARCH_6 + /* align buf to 16-byte boundary */ + while (((uintptr_t)buf)&15) { /* not on a 16-byte boundary */ + len--; + adler += *buf++; + sum2 += adler; + if (adler >= BASE) adler -= BASE; + MOD4(sum2); /* only added so many BASE's */ + } + + return adler32_vec(adler, sum2, buf, len); // armv7 neon vectorized implementation + +#else // _ARM_ARCH_6 + /* do length NMAX blocks -- requires just one modulo operation */ while (len >= NMAX) { len -= NMAX; @@ -153,6 +176,8 @@ uLong ZEXPORT adler32(adler, buf, len) /* return recombined sums */ return adler | (sum2 << 16); + +#endif // _ARM_ARCH_6 } /* ========================================================================= */ diff --git a/libkern/zlib/arm/adler32vec.s b/libkern/zlib/arm/adler32vec.s new file mode 100644 index 000000000..3af072caa --- /dev/null +++ b/libkern/zlib/arm/adler32vec.s @@ -0,0 +1,428 @@ +#include + +#define BASE 65521 /* largest prime smaller than 65536 */ +#define NMAX 5552 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ + +// Note: buf should have been 16-byte aligned in the caller function, + +// uLong adler32_vec(unsigned int adler, unsigned int sum2, const Bytef* buf, int len) { +// unsigned n; +// while (len >= NMAX) { +// len -= NMAX; +// n = NMAX / 16; /* NMAX is divisible by 16 */ 
+// do { +// DO16(buf); /* 16 sums unrolled */ +// buf += 16; +// } while (--n); +// MOD(adler); +// MOD(sum2); +// } +// if (len) { /* avoid modulos if none remaining */ +// while (len >= 16) { +// len -= 16; +// DO16(buf); +// buf += 16; +// } +// while (len--) { +// adler += *buf++; +// sum2 += adler; +// } +// MOD(adler); +// MOD(sum2); +// } +// return adler | (sum2 << 16); /* return recombined sums */ +// } + + +/* + DO16 vectorization: + given initial unsigned int sum2 and adler, and a new set of 16 input bytes (x[0:15]), it can be shown that + sum2 += (16*adler + 16*x[0] + 15*x[1] + ... + 1*x[15]); + adler += (x[0] + x[1] + ... + x[15]); + + therefore, this is what can be done to vectorize the above computation + 1. 16-byte aligned vector load into q2 (x[0:x15]) + 2. sum2 += (adler<<4); + 3. vmull.u8 (q9,q8),q2,d2 where d2 = (1,1,1,1...,1), (q9,q8) + 16 16-bit elements x[0:15] + 4. vmull.u8 (q11,q10),q2,q0 where q0 = (1,2,3,4...,16), (q11,q10) + 16 16-bit elements (16:1)*x[0:15] + 5. parallel add (with once expansion to 32-bit) (q9,q8) and (q11,q10) all the way to accumulate to adler and sum2 + + In this revision, whenever possible, 2 DO16 loops are combined into a DO32 loop. + 1. 32-byte aligned vector load into q2,q14 (x[0:x31]) + 2. sum2 += (adler<<5); + 3. vmull.u8 (4 q registers),(q2,q14),d2 where d2 = (1,1,1,1...,1), (4 q registers) : 32 16-bit elements x[0:31] + 4. vmull.u8 (4 q registers),(q2,q14),(q0,q15) where q0 = (1,...,32), (4 q regs) : 32 16-bit elements (32:1)*x[0:31] + 5. parallel add (with once expansion to 32-bit) the pair of (4 q regs) all the way to accumulate to adler and sum2 + + This change improves the performance by ~ 0.55 cycle/uncompress byte on ARM Cortex-A8. + +*/ + +/* + MOD implementation: + adler%BASE = adler - floor(adler*(1/BASE))*BASE; where (1/BASE) = 0x80078071 in Q47 + 1. vmull.u32 q2,(adler,sum2),(1/BASE) // *(1/BASE) in Q47 + 2. vshr.u64 q2,q2,#47 // floor function + 3. 
vpadd.u32 d4,d4,d5 // merge into a double word in d4 + 4. vmls.u32 (adler,sum2),d4,d3[0] // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE + +*/ + +#if defined _ARM_ARCH_6 // this file would be used only for armv6 or above + + + .text + .align 2 + .globl _adler32_vec +_adler32_vec: + +#if (!KERNEL_SUPPORT_NEON) || (!defined _ARM_ARCH_7) // for armv6 or armv7 without neon support + + + #define adler r0 + #define sum2 r1 + #define buf r2 + #define len r3 + #define one_by_base r4 + #define base r5 + #define nmax r6 + #define t r12 + #define vecs lr + #define x0 r8 + #define x1 r10 + #define x2 r11 + #define x3 r12 + #define zero r9 + + // this macro performs adler/sum2 update for 4 input bytes + + .macro DO4 + add sum2, adler, lsl #2 // sum2 += 4*adler; + ldr x0,[buf] // 4 bytes in 1 32-bit word + usada8 adler, x0, zero, adler // adler += sum(x0:x3) + ldrb x0,[buf], #4 // x0 + ldrb x2,[buf,#-2] // x2 + ldrb x1,[buf,#-3] // x1 + ldrb x3,[buf,#-1] // x3 + add sum2, x0, lsl #2 // sum2 += 4*x0 + add x3, x3, x1, lsl #1 // x3+2*x1 + add sum2, x2, lsl #1 // sum2 += 2*x2 + add x3, x1 // x3+3*x1 + add sum2, x3 // sum2 += x3+3*x1 + .endm + + // the following macro cascades 4 DO4 into an adler/sum2 update for 16 bytes + .macro DO16 + DO4 // adler/sum2 update for 4 input bytes + DO4 // adler/sum2 update for 4 input bytes + DO4 // adler/sum2 update for 4 input bytes + DO4 // adler/sum2 update for 4 input bytes + .endm + + // the following macro performs adler sum2 modulo BASE + .macro modulo_base + umull x0,x1,adler,one_by_base // adler/BASE in Q47 + umull x2,x3,sum2,one_by_base // sum2/BASE in Q47 + lsr x1, #15 // x1 >> 15 = floor(adler/BASE) + lsr x3, #15 // x3 >> 15 = floor(sum2/BASE) + mla adler, x1, base, adler // adler -= floor(adler/BASE)*BASE, i.e. adler %= BASE (base register holds -BASE); + mla sum2, x3, base, sum2 // sum2 -= floor(sum2/BASE)*BASE, i.e. sum2 %= BASE (base register holds -BASE); + .endm + + adr t, coeffs + push {r4-r6, r8-r11, lr} + ldmia t, {one_by_base, base, nmax} // load up coefficients (t aliases x3 in r12, so it is dead after this load) + + subs len, nmax // pre-subtract len by NMAX + eor zero, zero // a dummy zero register to use
usada8 instruction + blt len_lessthan_NMAX // if (len < NMAX) skip the while loop + +while_lengenmax_loop: // do { + lsr vecs, nmax, #4 // vecs = NMAX/16; + +len16_loop: // do { + + DO16 + + subs vecs, #1 // vecs--; + bgt len16_loop // } while (vecs>0); + + modulo_base // adler sum2 modulo BASE + + subs len, nmax // len -= NMAX + bge while_lengenmax_loop // } while (len >= NMAX); + +len_lessthan_NMAX: + adds len, nmax // add NMAX back to undo the pre-subtract + + subs len, #16 // pre-decrement len by 16 + blt len_lessthan_16 + +len16_loop2: + + DO16 + + subs len, #16 + bge len16_loop2 + +len_lessthan_16: + adds len, #16 // post-increment len by 16 + beq len_is_zero + +remaining_buf: + ldrb x0, [buf], #1 + subs len, #1 + add adler, x0 + add sum2, adler + bgt remaining_buf + +len_is_zero: + + modulo_base // adler sum2 modulo BASE + + add r0, adler, sum2, lsl #16 // to return sum2<<16 | adler + + pop {r4-r6, r8-r11, pc} + + .align 2 +coeffs: + .long -2146992015 // 0x80078071 : 1/BASE in Q47 + .long -BASE // negated BASE, so the mla in modulo_base subtracts + .long NMAX + +#else // KERNEL_SUPPORT_NEON and _ARM_ARCH_7 + + + + #define adler r0 + #define sum2 r1 + #define buf r2 + #define len r3 + #define nmax r4 + #define vecs lr // vecs = NMAX/16 + #define n r5 + + #define t r12 + + #define sum2_coeff q0 + #define sum2_coeff0 d0 + #define sum2_coeff1 d1 + #define alder_coeff q1 + #define ones d2 + #define x0_x15 q2 + #define x0_x7 d4 + #define x8_x15 d5 + #define adlersum2 d6 + #define adler16 d25 + +#if defined _ARM_ARCH_7 + + adr t, vec_table // address to vec_table[] + stmfd sp!, {r4, r5, lr} + + vld1.32 {q0-q1},[t,:128]! // loading up coefficients for adler/sum2 computation + vld1.32 {q15},[t,:128]!
// for sum2 computation + ldr nmax, [t] // NMAX + + vmov adlersum2, sum2, adler // pack up adler/sum2 into a double register + + cmp len, nmax // len vs NMAX + lsr vecs, nmax, #4 // vecs = NMAX/16; + blt len_lessthan_NMAX // if (len < NMAX) skip the while loop + + sub len, nmax // pre-decrement len by NMAX + +while_len_ge_NMAX_loop: // while (len>=NMAX) { + + mov n, vecs, lsr #1 // n = (NMAX/16)/2 = NMAX/32 iterations of 32 bytes; the odd 16-byte block is handled after do_loop + +do_loop: // do { + + vshll.u32 q12, adlersum2, #5 // d25 = (0,32*adler) to be added into (adler,sum2) + vld1.32 {x0_x15},[buf,:128]! // 16-byte input x0:x15 + vmull.u8 q8, x0_x7, ones // 16-bit x0-x7 + vld1.32 {q14}, [buf,:128]! // x16:x31 + vmull.u8 q9, x8_x15, ones // 16-bit x8-x15 + vadd.u32 adlersum2,adler16 // sum2 += old adler*32; + vmull.u8 q12, d28, ones // 16-bit x16-x23 + vmull.u8 q13, d29, ones // 16-bit x24-x31 + vmull.u8 q10, d28, sum2_coeff0 // 16-bit x16*16, x17*15, ..., x23*9 + vmull.u8 q11, d29, sum2_coeff1 // 16-bit x24*8, x25*7, ..., x31*1 + vadd.u16 q8, q8, q9 // q8 = (x0+x8):(x7+x15) 8 16-bit elements for adler + vmull.u8 q9, x0_x7, d30 // 16-bit x0*32,...,x7*25 + vmull.u8 q14, x8_x15, d31 // 16-bit x8*24,...,x15*17 + vadd.u16 q12, q12, q13 // q12 = (x16+x24):(x23+x31) 8 16-bit elements for adler + vadd.u16 q10, q11 // 8 16-bit elements for sum2 + vadd.u16 q8, q12 // 8 16-bit elements for adler + vadd.u16 q9, q14 // 8 16-bit elements for sum2 + vadd.u16 q10, q9 // 8 16-bit elements for sum2 + vpaddl.u16 q8, q8 // 4 32-bit elements for adler + vpaddl.u16 q10, q10 // 4 32-bit elements for sum2 + vpadd.u32 d16,d16,d17 // 2 32-bit elements for adler + vpadd.u32 d17,d20,d21 // 2 32-bit elements for sum2 + subs n, #1 // --n + vpadd.u32 d4,d17,d16 // s8 : 32-bit elements for sum2, s9 : 32-bit element for adler + vadd.u32 adlersum2,d4 // update adler/sum2 with the new 32 bytes input + + bgt do_loop // } while (--n); + + vshll.u32 q12, adlersum2, #4 // d25 = (0,16*adler) to be added into (adler,sum2) + + vld1.32 {x0_x15},[buf,:128]!
// 16-byte input + + vmull.u8 q8, x0_x7, ones // 16-bit x0-x7 + vmull.u8 q9, x8_x15, ones // 16-bit x8-x15 + vmull.u8 q10, x0_x7, sum2_coeff0 // 16-bit x0*16, x1*15, ..., x7*9 + vmull.u8 q11, x8_x15, sum2_coeff1 // 16-bit x8*8, x9*7, ..., x15*1 + + vadd.u16 q8, q8, q9 // 8 16-bit elements for adler + vadd.u16 q10, q10, q11 // 8 16-bit elements for sum2 + vpaddl.u16 q8, q8 // 4 32-bit elements for adler + vpaddl.u16 q10, q10 // 4 32-bit elements for sum2 + vpadd.u32 d16,d16,d17 // 2 32-bit elements for adler + vpadd.u32 d17,d20,d21 // 2 32-bit elements for sum2 + vadd.u32 adlersum2,adler16 // sum2 += old adler*16; + vpadd.u32 d4,d17,d16 // s8 : 32-bit elements for sum2, s9 : 32-bit element for adler + vadd.u32 adlersum2,d4 // update adler/sum2 with the new 16 bytes input + + // mod(adler,BASE); mod(sum2,BASE); + vmull.u32 q2,adlersum2,d3[1] // adler/BASE, sum2/BASE in Q47 + vshr.u64 q2,q2,#47 // take the integer part + vpadd.u32 d4,d4,d5 // merge into a double word in d4 + vmls.u32 adlersum2,d4,d3[0] // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE + + subs len, nmax // len -= NMAX; + bge while_len_ge_NMAX_loop // repeat while len >= NMAX + + add len, nmax // post-increment len by NMAX + +len_lessthan_NMAX: + + cmp len, #0 + beq len_is_zero // if len==0, branch to skip the following + + + subs len, #32 // pre-decrement len by 32 + blt len_lessthan_32 // if len < 32, branch to len_lessthan_32 + +len32_loop: + + vshll.u32 q12, adlersum2, #5 // d25 = (0,32*adler) to be added into (adler,sum2) + vld1.32 {x0_x15},[buf,:128]! // 16-byte input x0:x15 + vmull.u8 q8, x0_x7, ones // 16-bit x0-x7 + vld1.32 {q14}, [buf,:128]!
// x16:x31 + vmull.u8 q9, x8_x15, ones // 16-bit x8-x15 + vadd.u32 adlersum2,adler16 // sum2 += old adler*32; + vmull.u8 q12, d28, ones // 16-bit x16-x23 + vmull.u8 q13, d29, ones // 16-bit x24-x31 + vmull.u8 q10, d28, sum2_coeff0 // 16-bit x16*16, x17*15, ..., x23*9 + vmull.u8 q11, d29, sum2_coeff1 // 16-bit x24*8, x25*7, ..., x31*1 + vadd.u16 q8, q8, q9 // q8 = (x0+x8):(x7+x15) 8 16-bit elements for adler + vmull.u8 q9, x0_x7, d30 // 16-bit x0*32,...,x7*25 + vmull.u8 q14, x8_x15, d31 // 16-bit x8*24,...,x15*17 + vadd.u16 q12, q12, q13 // q12 = (x16+x24):(x23+x31) 8 16-bit elements for adler + vadd.u16 q10, q11 // 8 16-bit elements for sum2 + vadd.u16 q8, q12 // 8 16-bit elements for adler + vadd.u16 q9, q14 // 8 16-bit elements for sum2 + vadd.u16 q10, q9 // 8 16-bit elements for sum2 + vpaddl.u16 q8, q8 // 4 32-bit elements for adler + vpaddl.u16 q10, q10 // 4 32-bit elements for sum2 + vpadd.u32 d16,d16,d17 // 2 32-bit elements for adler + vpadd.u32 d17,d20,d21 // 2 32-bit elements for sum2 + subs len, #32 // len -= 32; + vpadd.u32 d4,d17,d16 // s8 : 32-bit elements for sum2, s9 : 32-bit element for adler + vadd.u32 adlersum2,d4 // update adler/sum2 with the new 32 bytes input + + bge len32_loop + +len_lessthan_32: + + adds len, #(32-16) // post-increment len by 32, then pre-decrement by 16 + blt len_lessthan_16 // if len < 16, branch to len_lessthan_16 + + vshll.u32 q12, adlersum2, #4 // d25 = (0,16*adler) to be added into (adler,sum2) + + vld1.32 {x0_x15},[buf,:128]!
// 16-byte input + + + vmull.u8 q8, x0_x7, ones // 16-bit x0-x7 + vmull.u8 q9, x8_x15, ones // 16-bit x8-x15 + vmull.u8 q10, x0_x7, sum2_coeff0 // 16-bit x0*16, x1*15, ..., x7*9 + vmull.u8 q11, x8_x15, sum2_coeff1 // 16-bit x8*8, x9*7, ..., x15*1 + + vadd.u16 q8, q8, q9 // 8 16-bit elements for adler + vadd.u16 q10, q10, q11 // 8 16-bit elements for sum2 + vpaddl.u16 q8, q8 // 4 32-bit elements for adler + vpaddl.u16 q10, q10 // 4 32-bit elements for sum2 + vpadd.u32 d16,d16,d17 // 2 32-bit elements for adler + vpadd.u32 d17,d20,d21 // 2 32-bit elements for sum2 + subs len, #16 // decrement len by 16 + vadd.u32 adlersum2,adler16 // sum2 += old adler*16; + vpadd.u32 d4,d17,d16 // s8 : 32-bit elements for sum2, s9 : 32-bit element for adler + vadd.u32 adlersum2,d4 // update adler/sum2 with the new 16 bytes input + +len_lessthan_16: + adds len, #16 // post-increment len by 16 + beq len_is_zero_internal // if len==0, branch to len_is_zero_internal + + // restore adler/sum2 into general registers for remaining (<16) bytes + + vmov sum2, adler, adlersum2 +remaining_len_loop: + ldrb t, [buf], #1 // *buf++; + subs len, #1 // len--; + add adler,t // adler += *buf + add sum2,adler // sum2 += adler + bgt remaining_len_loop // break if len<=0 + + vmov adlersum2, sum2, adler // move to double register for modulo operation + +len_is_zero_internal: + + // mod(adler,BASE); mod(sum2,BASE); + + vmull.u32 q2,adlersum2,d3[1] // adler/BASE, sum2/BASE in Q47 + vshr.u64 q2,q2,#47 // take the integer part + vpadd.u32 d4,d4,d5 // merge into a double word in d4 + vmls.u32 adlersum2,d4,d3[0] // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE + +len_is_zero: + + vmov sum2, adler, adlersum2 // restore adler/sum2 from (s12=sum2, s13=adler) + add r0, adler, sum2, lsl #16 // to return adler | (sum2 << 16); + ldmfd sp!, {r4, r5, pc} // restore registers and return + + + // constants to be loaded into q registers + .align 4 // 16 byte aligned + +vec_table: + + // coefficients for computing sum2 + .long
0x0d0e0f10 // s0 + .long 0x090a0b0c // s1 + .long 0x05060708 // s2 + .long 0x01020304 // s3 + + // coefficients for computing adler + .long 0x01010101 // s4/d2 + .long 0x01010101 // s5 + + .long BASE // s6 : BASE + .long 0x80078071 // s7 : 1/BASE in Q47 + + // q15 : d30.d31 + .long 0x1d1e1f20 // d30 low word : weights 32,31,30,29 + .long 0x191a1b1c // d30 high word : weights 28,27,26,25 + .long 0x15161718 // d31 low word : weights 24,23,22,21 + .long 0x11121314 // d31 high word : weights 20,19,18,17 + +NMAX_loc: + .long NMAX // NMAX + +#endif // _ARM_ARCH_7 + +#endif // (!KERNEL_SUPPORT_NEON) || (!defined _ARM_ARCH_7) + +#endif // _ARM_ARCH_6 + diff --git a/libkern/zlib/arm/inffastS.s b/libkern/zlib/arm/inffastS.s new file mode 100644 index 000000000..98855795a --- /dev/null +++ b/libkern/zlib/arm/inffastS.s @@ -0,0 +1,571 @@ +#include + +// the following assembly code is hard-wired for the case where POSTINC is not defined, + +#if 0 // #ifdef POSTINC +# define OFF 0 +# define PUP(a) *(a)++ +#else +# define OFF 1 +# define PUP(a) *++(a) +#endif + +// the code uses r9, therefore, it does not meet the register protocol for armv5 and below +// the code can only be used for armv6 and above + +#if defined _ARM_ARCH_6 + + .cstring + .align 2 +LC0: + .ascii "invalid distance too far back\0" + .align 2 +LC1: + .ascii "invalid distance code\0" + .align 2 +LC2: + .ascii "invalid literal/length code\0" + + // renaming the register and stack memory use + + #define out r0 + #define strm r10 + #define state r5 + #define in r11 + #define write r9 + #define distcode r8 + #define bits lr + #define hold r4 + + // stack memory allocation (the lencode pointer is additionally kept at [sp,#40], addressed directly without a define) + + #define window_loc [sp,#0] + #define last_loc [sp,#4] + #define beg_loc [sp,#8] + #define end_loc [sp,#12] + #define wsize_loc [sp,#16] + #define whave_loc [sp,#20] + #define windowm1_loc [sp,#28] + #define lmask_loc [sp,#32] + #define dmask_loc [sp,#36] + #define dist_loc [sp,#48] + + #define local_size 52 + + // the following defines the variable offset in the inflate_state structure (in inflate.h) + + #define state_mode [state, #0] + #define state_last [state, #4] + #define state_wrap
[state, #8] + #define state_havedict [state, #12] + #define state_flags [state, #16] + #define state_dmax [state, #20] + #define state_wbits [state, #36] + #define state_wsize [state, #40] + #define state_whave [state, #44] + #define state_write [state, #48] + #define state_window [state, #52] + #define state_hold [state, #56] + #define state_bits [state, #60] + #define state_lencode [state, #76] + #define state_distcode [state, #80] + #define state_lenbits [state, #84] + #define state_distbits [state, #88] + + +// void inflate_fast(z_streamp strm, unsigned start) +// input : +// r0 = strm, (move to r10) +// r1 = start (inflate()'s starting value for strm->avail_out) + + .text + .align 2 + .globl _inflate_fast +_inflate_fast: + + stmfd sp!, {r4-r6,r8-r11,lr} + sub sp, sp, #local_size + +#if defined(_ARM_ARCH_5) + ldrd r2,r3,[r0, #0] // r2 = strm->next_in, r3 = strm->avail_in +#else + ldmia r0, {r2-r3} +#endif + + sub in, r2, #OFF // in = strm->next_in - OFF; + sub r2, #(OFF+5) // next_in -= (OFF+5); + ldr state, [r0, #28] // state = (struct inflate_state FAR *)strm->state; + add r3, r3, r2 // last = next_in - OFF + (avail_in - 5); next_in already updated + mov strm, r0 + str r3, last_loc // store last to release r3 + + ldr r3, [r0, #12] // next_out + ldr r2, [strm, #16] // avail_out + + sub out, r3, #OFF // out = strm->next_out - OFF; r0 is used as out from this point on + + sub r3, r3, #256 // next_out - 256 + rsb r1, r2, r1 // start - avail_out + sub r3, r3, #(1+OFF) // next_out-OFF-257 + add r3, r3, r2 // r3 = end = avail_out + (next_out-OFF) - 257 = avail_out + out - 257 + rsb r2, r1, out // r2 = beg = out - (start - avail_out); +#if defined(_ARM_ARCH_5) + strd r2,r3, beg_loc // store beg/end + ldrd r2,r3, state_wsize // wsize/whave + strd r2,r3, wsize_loc // store wsize/whave + //ldrd r6,hold, state_window // window/hold, hold use r7 + ldr r6, state_window // state->window + ldr hold, state_hold // state->hold + nop +#else + // for architecture < armv5, ldrd/strd is not available + str r2, beg_loc // store
beg + str r3, end_loc // store end + ldr r2, state_wsize // state->wsize + ldr r3, state_whave // state->whave + str r2, wsize_loc // store wsize + str r3, whave_loc // store whave + ldr r6, state_window // state->window + ldr hold, state_hold // state->hold +#endif + + ldr ip, state_lencode // lencode + mov r3, #1 // used to derive lmask and dmask + ldr write, state_write // write (r9 from this point on) : window write index + nop + str ip, [sp, #40] // save lencode + sub ip, r6, #1 // window-1 + str r6, window_loc // store window + str ip, windowm1_loc // store window-1 + ldr r2, state_lenbits // lenbits + ldr bits, state_bits // bits, use lr from this point on + ldr distcode, state_distcode// distcode, use r8 + mov r2, r3, asl r2 // (1<lenbits) - 1; NOTE(review): this span looks truncated (text between a less-than and a greater-than sign seems to be missing); no visible instruction loads distbits into r12 or subtracts 1 to form lmask -- confirm against the original inffastS.s + mov r3, r3, asl r12 // (1U << state->distbits) + sub r3, r3, #1 // dmask = (1U << state->distbits) - 1; + +#if defined(_ARM_ARCH_5) + strd r2, r3, lmask_loc // store lmask/dmask +#else + str r2, lmask_loc // lmask + str r3, dmask_loc // dmask +#endif + + // start the do loop decoding literals and length/distances + // until end-of-block or not enough input data or output space + +do_loop: + cmp bits, #15 // bits vs 15 + ldr r1, lmask_loc // lmask + bge bitsge15 // if bits >= 15, skip loading new 16 bits + + // this shortcut relies on the processor reading data in little-endian mode + ldrh r3, [in,#1] // read 2 bytes + add in, #2 // in pointer += 2 + add hold, hold, r3, asl bits // deposit the new 2 bytes into hold + add bits, #16 // bits count += 16 + +bitsge15: + ldr ip, [sp, #40] // restore lencode + and r3, hold, r1 // r3 = hold & lmask + b dolen + +op_not_zero: + + tst r2, #16 // if (op&16) + bne length_base // branch to length_base + + tst r2, #64 // else if (op&64) + bne end_of_block // branch to end_of_block processing + + // 2nd-level length code, this is the part where if ((op & 64) == 0) { ...
} + + // this.val + (hold & ((1U << op) - 1)); + // r3 = r1 + hold & ((1< 8-bit code, 8-bit bits, 16-bit val NOTE(review): text appears truncated here; the dolen label targeted by the branch after bitsge15 is not visible -- confirm against the original inffastS.s + ldrb r2, [ip,r3,asl #2] // op = (unsigned)(this.op); + add r3, ip, r3, asl #2 // r3 = this + ldrb ip, [r3, #1] // ip = this.bits + ldrh r1, [r3, #2] // r1 = this.value + cmp r2, #0 // op == 0 ? + + mov hold, hold, lsr ip // hold >>= this.bits + rsb bits, ip, bits // bits -= this.bits + bne op_not_zero // branch to op_not_zero if this.op != 0 + + strb r1, [out, #1]! // PUP(out) = (unsigned char)(this.val); + +do_loop_while: + ldr r1, last_loc // last + ldr r2, end_loc // end + cmp in, r1 // compare in vs last + cmpcc out, r2 // if in < last, compare out vs end + bcc do_loop // if (in < last && out < end) go back to do_loop + +update_state_and_return: + + sub r2, in, bits, lsr #3 // r2 = in - (bits>>3) + + add r3, r2, #OFF // r3 = (in - (bits>>3)) + OFF + str r3, [strm, #0] // strm->next_in = in + OFF; + + add r3, out, #OFF // out + OFF + str r3, [strm, #12] // strm->next_out = out + OFF; + + ldr r3, last_loc // r3 = last + ldr ip, end_loc // ip = end + + cmp r3, r2 // compare last vs in + addhi r3, r3, #5 // if last > in, last +=5 + movls r6, r3 // o.w., r6 = last + rsbls r3, r6, r2 // r3 = in-last + rsbhi r3, r2, r3 // r3 = (last+5) - in + rsbls r3, r3, #5 // r3 = 5 - (in-last); + cmp out, ip // compare out vs end + str r3, [strm, #4] // strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last)); + movcs r2, ip // if out=end, r3 = end+256 + rsbcs r3, r2, out // if out=end, r3 = end+257 + rsbcs r3, r3, #256 // if out>3) << 3; NOTE(review): the comments on this and the two preceding instructions look truncated -- confirm against the original inffastS.s + rsbcc r3, out, r3 // if out=end, r3 = 257 + (end-out) + str r3, [strm, #16] // strm->avail_out = (unsigned)(out < end ?
257 + (end - out) : 257 - (out - end)); + + // hold &= (1U << bits) - 1; + + rsb ip, bits, #32 // 32-bits + ror hold, hold, bits // this is equivalent to hold<<(32-bits) + lsr hold, hold, ip // logical shift right by (32-bits), hold now only keeps the bits LSBs + + str bits, state_bits // state->bits = bits; + str hold, state_hold // state->hold = hold; + + add sp, #local_size // pop out stack memory + ldmfd sp!,{r4-r6,r8-r11,pc} // restore registers and return + +length_base: // r2=op, r1=this.val + ands r2, r2, #15 // op&=15; + mov r6, r1 // len = (unsigned) this.val; + beq op_is_zero // if op==0, branch to op_is_zero + cmp r2, bits // op vs bits + ldrhib r3, [in, #1]! // if (op>bits) r3 = (PUP(in)); + addhi hold, hold, r3, asl bits // if (op>bits) hold += (unsigned long)(PUP(in)) << bits; + + rsb ip, r2, #32 // 32-op + ror r3, hold, r2 // (hold<<(32-op)) + add r6, r1, r3, lsr ip // len += (unsigned)hold & ((1U << op) - 1); + + addhi bits, bits, #8 // if (op>bits) bits += 8; + + mov hold, hold, lsr r2 // hold >>= op; + rsb bits, r2, bits // bits -= op; + +op_is_zero: + cmp bits, #14 + ldrh r3,[in,#1] // if (bits < 15) { 2 (PUP(in)); no condition code for better performance + addls in, #2 // in+=2; + addls hold, hold, r3, asl bits // twice hold += (unsigned long)(PUP(in)) << bits; + addls bits, #16 // 2 bits += 8; } + +dodist: + + ldr r2, dmask_loc // r2 = dmask + and r3, hold, r2 // r3 = hold & dmask + mov r2, r3, asl #2 + add r3, r2, distcode // &dcode[hold&dmask]; + ldrb ip, [r2, distcode] // op + ldrh r1, [r3, #2] // dist = (unsigned)(this.val); + tst ip, #16 // op vs 16 + ldrb r3, [r3, #1] // this.bits + mov hold, hold, lsr r3 // hold >>= this.bits; + rsb bits, r3, bits // bits -= this.bits; + bne distance_base // if (op&16) { distance base processing } + tst ip, #64 // + beq second_distance_code // else if ((op&64)==0) branch to 2nd level distance code + + b invalide_distance_code + +check_2nd_level_distance_code: + + tst r2, #64 // check for else if ((op & 64)
== 0) for 2nd level distance code + bne invalide_distance_code + +second_distance_code: + + rsb r2, ip, #32 // 32-op + ror r3, hold, ip // hold<<(32-op) + add r3, r1, r3, lsr r2 // this.val + (hold & ((1U << op) - 1)) + + mov r2, r3, asl #2 + add r3, r2, distcode // this = dcode[this.val + (hold & ((1U << op) - 1))]; + ldrb r2, [r2, distcode] // this.op + ldrh r1, [r3, #2] // this.val + + tst r2, #16 // op&16 + ldrb r3, [r3, #1] // this.bits + mov ip, r2 // op + mov hold, hold, lsr r3 // hold >>= this.bits + rsb bits, r3, bits // bits -= this.bits + beq check_2nd_level_distance_code + +distance_base: // this is invoked from if ((op&16)!=0) + + and r2, ip, #15 // op &= 15; + cmp r2, bits // op vs bits + ldrhib r3, [in, #1]! // if (op > bits) (PUP(in)) + addhi hold, hold, r3, asl bits // hold += (unsigned long)(PUP(in)) << bits; + addhi bits, bits, #8 // bits += 8; + cmphi r2, bits // internal (bits < op) + ldrhib r3, [in, #1]! // if (op > bits) (PUP(in)) + addhi hold, hold, r3, asl bits // hold += (unsigned long)(PUP(in)) << bits; + + rsb ip, r2, #32 // (32-op) + ror r3, hold, r2 // hold<<(32-op) + add r3, r1, r3, lsr ip // dist += (unsigned)hold & ((1U << op) - 1); + + ldr ip, beg_loc // beg + +#ifdef INFLATE_STRICT + ldr r1, state_dmax // r1 = dmax +#endif + + str r3, dist_loc // save dist + +#ifdef INFLATE_STRICT + cmp r3, r1 // dist vs dmax; NOTE(review): this cmp overwrites the flags that the later addhi (the internal bits += 8) depends on -- confirm the INFLATE_STRICT build behaves as intended + bgt invalid_distance_too_far_back // if dist > dmax, set up msg/mode = bad and break +#endif + + ldr r1, dist_loc // dist + rsb r3, ip, out // (out - beg); + addhi bits, bits, #8 // this is the internal bits += 8 from above + + cmp r1, r3 // dist vs (out - beg) + + mov hold, hold, lsr r2 // hold >>= op ; + rsb bits, r2, bits // bits -= op; + rsbls r2, r1, out // if (dist<=op) r2 = from = out-dist + bls copy_direct_from_output // if (dist<=op) branch to copy_direct_from_output + + ldr r2, whave_loc // whave + rsb r1, r3, r1 // op = dist-op + cmp r2, r1 // whave vs op + nop // pad dummy for better performance + bcc
invalid_distance_too_far_back // if whave < op, message invalid distance too far back, and break + + cmp write, #0 // write + bne non_very_common_case // if (write != 0) branch to non_very_common_case + + // the following : if (write == 0) { /* very common case */ } + nop // pad dummy for better performance + ldr ip, wsize_loc // wsize + cmp r6, r1 // len vs op + rsb r3, r1, ip // wsize - op + ldr ip, windowm1_loc // window - 1 + add r2, ip, r3 // from = window - 1 + wsize - op : setup for using PUP(from) + movhi r3, r1 // if len > op, r3 = op + movhi r1, out // if len > op, r1 = out + bhi some_from_window // if (len > op), branch to some_from_window + +finish_copy: + + // while (len > 2) { + // PUP(out) = PUP(from); + // PUP(out) = PUP(from); + // PUP(out) = PUP(from); + // len -= 3; + // } + // if (len) { + // PUP(out) = PUP(from); + // if (len > 1) + // PUP(out) = PUP(from); + // } + + cmp r6, #2 // len > 2 ? + movls r1, r6 // if (len<=2) r1 = len + bls lenle2 // if (len<=2) branch to lenle2 + mov r1, r6 +fcopy_per3bytes: + ldrb r3, [r2, #1] // 1st PUP(from) + sub r1, r1, #3 // len-=3 + cmp r1, #2 // len > 2 ? + strb r3, [out, #1] // 1st PUP(out) = PUP(from); + ldrb r3, [r2, #2] // 2nd PUP(from) + add r2, r2, #3 // from+=3 + strb r3, [out, #2] // 2nd PUP(out) = PUP(from); + ldrb r3, [r2, #0] // 3rd PUP(from) + add out, out, #3 // out+=3 + strb r3, [out, #0] // 3rd PUP(out) = PUP(from); + bgt fcopy_per3bytes // while (len > 2) back to loop head +lenle2: + cmp r1, #0 // len + beq do_loop_while // back to while loop head if len==0 + ldrb r3, [r2, #1] // PUP(from) + cmp r1, #2 // check whether len==2 + strb r3, [out, #1]! // PUP(out) = PUP(from); + bne do_loop_while // back to while loop head if len==1 + ldrb r3, [r2, #2] // 2nd PUP(from) + strb r3, [out, #1]! // 2nd PUP(out) = PUP(from); + b do_loop_while // back to while loop head + +end_of_block: + tst r2, #32 // if (op&32) + movne r3, #11 // TYPE?
+ strne r3, state_mode // state->mode = TYPE + bne update_state_and_return // break the do loop and branch to get ready to return + ldr r3, messages // "invalid literal/length code" message +L75: + add r3, pc, r3 + str r3, [strm, #24] // strm->msg = (char *)"invalid literal/length code"; + mov r3, #27 // BAD? + str r3, state_mode // state->mode = BAD; + b update_state_and_return // break the do loop and branch to get ready to return + +//Read_2_bytes: +// ldrh r3,[in,#1] // 2 (PUP(in)) together +// add in, #2 // 2 in++ +// add hold, hold, r3, asl bits // twice hold += (unsigned long)(PUP(in)) << bits; +// add bits, #16 // 2 bits += 8; +// b dodist // branch to dodist + nop // a pad dummy instruction to give better performance + +copy_direct_from_output: // r2 = from = out - dist ; + + // do { + ldrb r3, [r2, #1] // 1st PUP(from) + sub r6, r6, #3 // len-=3 + cmp r6, #2 // len vs 2 + strb r3, [out, #1] // 1st PUP(out) = PUP(from); + ldrb r3, [r2, #2] // 2nd PUP(from) + add r2, r2, #3 // update from+=3 + strb r3, [out, #2] // 2nd PUP(out) = PUP(from); + ldrb r3, [r2, #0] // 3rd PUP(from); + add out, out, #3 // update out+=3 + strb r3, [out, #0] // 3rd PUP(out) = PUP(from); + bhi copy_direct_from_output // while (len>2); + + // len in r6 can now be 0 1 or 2 + + subs r6,#1 // len--; + ldrb r3, [r2, #1] // PUP(from) + blt do_loop_while // if len<0 back to while loop head + strb r3, [out, #1]! // PUP(out) = PUP(from); + subs r6, #1 // len--; + ldrb r3, [r2, #2] // 2nd PUP(from) + blt do_loop_while // if len<0 back to while loop head + strb r3, [out, #1]!
// 2nd PUP(out) = PUP(from); + b do_loop_while // back to while loop head + + +invalide_distance_code: + ldr r3, messages+4 // "invalid distance code" +L72: + add r3, pc, r3 + str r3, [strm, #24] // strm->msg = (char *)"invalid distance code"; + mov r3, #27 + str r3, state_mode // state->mode = BAD; + b update_state_and_return // break, restore registers, and return + + +some_from_window: + add out, r3, out // out += op + rsb r6, r3, r6 // len -= op +some_from_window_loop: // do { + ldrb ip, [r2, #1]! // PUP(from); + subs r3, r3, #1 // --op + strb ip, [r1, #1]! // PUP(out) = PUP(from); + bne some_from_window_loop // } while(op); + ldr r3, dist_loc // dist + rsb r2, r3, out // from = out - dist; + b finish_copy + +non_very_common_case: + cmp write, r1 // write vs op + nop // pad dummy for better performance + bcs contiguous_in_window // if (write >= op) branch to contiguous_in_window + + /* wrap around window */ + + ldr r2, wsize_loc // wsize + ldr ip, windowm1_loc // window-1 + add r3, write, r2 // r3 = wsize+write + rsb r3, r1, r3 // r3 = wsize+write-op + add r2, ip, r3 // r2 = from = wsize+write-op+window-1; + rsb r1, write, r1 // op -= write; + + cmp r6, r1 // len vs op + bls finish_copy // if (len <= op) branch to finish_copy + rsb r6, r1, r6 // len -= op +waw_loop: // do { + ldrb r3, [r2, #1]! // PUP(from) + subs r1, r1, #1 // --op; + strb r3, [out, #1]! // PUP(out) = PUP(from); + bne waw_loop // } while (op); + + cmp write, r6 // write vs len + ldrcs r2, windowm1_loc // if (write>=len) r2 = from = window-1; + bcs finish_copy // if (write>=len) branch to finish_copy + + // some from start of window + + mov r1, write // op = write + sub r6, write // len -= op + sub ip, out // ip = (window-1) - out + add ip, #1 // out+ip -> from +sow_loop: // do { + ldrb r3,[out, ip] // PUP(from) + subs r1, #1 // --op; + strb r3, [out,#1]!
// PUP(out) = PUP(from); + bne sow_loop // } while (op); + + ldr r2, dist_loc // dist + sub r6, r6, write // len -= write; NOTE(review): len (r6) already had write subtracted before sow_loop, so this looks like a second decrement of the same amount -- confirm against the C inflate_fast in inffast.c + rsb r2, r2, out // r2 = from = out-dist + b finish_copy // continue to finish_copy + + +contiguous_in_window: + ldr ip, windowm1_loc // window-1 + cmp r6, r1 // len vs op + rsb r3, r1, write // r3 = write-op + add r2, ip, r3 // r2 = from = window+write-op-1 + bls finish_copy // if (len <= op) branch to finish_copy + rsb r6, r1, r6 // len -= op + ldr r3, dist_loc // dist +ciw_loop: + ldrb ip, [r2, #1]! // PUP(from) + subs r1, r1, #1 // op-- + strb ip, [out, #1]! // PUP(out) = PUP(from); + bne ciw_loop // while (--op); + rsb r2, r3, out // from = out - dist; + b finish_copy + +invalid_distance_too_far_back: + ldr r3, messages+8 // "invalid distance too far back" +L42: + add r3, pc, r3 + str r3, [strm, #24] // strm->msg = (char *)"invalid distance too far back"; + mov r3, #27 + str r3, state_mode // state->mode = BAD; + b update_state_and_return // break, restore registers, and return + + .align 2 +messages: + .long LC2-8-(L75) + .long LC1-8-(L72) + .long LC0-8-(L42) + +#endif // defined _ARM_ARCH_6 diff --git a/libkern/zlib/inffast.c b/libkern/zlib/inffast.c index 82d2795c0..54f0ee815 100644 --- a/libkern/zlib/inffast.c +++ b/libkern/zlib/inffast.c @@ -30,6 +30,14 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ + +#if defined _ARM_ARCH_6 + + // dummy definition, for armv6 or above, compile code from inffastS.s + typedef char DummyDefinition; + +#else // architecture + #include "zutil.h" #include "inftrees.h" #include "inflate.h" @@ -343,3 +351,5 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ */ #endif /* !ASMINF */ + +#endif // architecture diff --git a/makedefs/MakeInc.def b/makedefs/MakeInc.def index 0366b6215..60c9bee2f 100644 --- a/makedefs/MakeInc.def +++ b/makedefs/MakeInc.def @@ -250,7 +250,7 @@ ifeq (-arch armv6,$(ARCH_FLAGS_ARM)) CFLAGS_ARM += -mthumb endif ifeq (-arch
armv5,$(ARCH_FLAGS_ARM)) -CFLAGS_ARM += -mthumb +CFLAGS_ARM += -mno-thumb endif ifeq (-arch xscale,$(ARCH_FLAGS_ARM)) CFLAGS_ARM += -mthumb @@ -394,7 +394,8 @@ export LDFLAGS_KERNEL_ARM = \ -Wl,-new_linker \ -Wl,-pagezero_size,0x0 \ -Wl,-segaddr,__HIB,0xC0000000 \ - -Wl,-image_base,0xC0008000 + -Wl,-image_base,0xC0008000 \ + -Wl,-exported_symbols_list,$(TARGET)/kernel-kpi.exp export LDFLAGS_KERNEL = $(LDFLAGS_KERNEL_GEN) \ diff --git a/makedefs/MakeInc.rule b/makedefs/MakeInc.rule index 618a7849f..3ba713083 100644 --- a/makedefs/MakeInc.rule +++ b/makedefs/MakeInc.rule @@ -570,6 +570,7 @@ do_build_mach_kernel: $(TARGET)/kgmacros $(TARGET)/mach_kernel $(TARGET)/mach_kernel: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST), $(addprefix $(component)/$(firstword $($(addsuffix _KERNEL_CONFIG, $(shell printf $(component) | tr a-z A-Z))) $(KERNEL_CONFIG))/, $(addsuffix .o, $(component))))) lastkernelconstructor.o $(_v)${MAKE} version.o + $(_v)${MAKE} build_mach_kernel_exports @echo LD mach_kernel.sys $(_v)$(CAT) $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST), $(addprefix $(component)/$(firstword $($(addsuffix _KERNEL_CONFIG, $(shell printf $(component) | tr a-z A-Z))) $(KERNEL_CONFIG))/, $(addsuffix .o, $(component))))) > mach_kernel.filelist $(_v)$(LD) $(LDFLAGS_KERNEL) -filelist mach_kernel.filelist version.o lastkernelconstructor.o `if [ -e $(STATIC_KMODS) ]; then echo $(STATIC_KMODS); fi` \ @@ -606,6 +607,14 @@ lastkernelconstructor.o: $(SRCROOT)/libsa/lastkernelconstructor.c $(TARGET)/kgmacros: $(SRCROOT)/kgmacros $(_v)$(INSTALL) $(INSTALL_FLAGS) $? 
$@ +.PHONY: build_mach_kernel_exports +build_mach_kernel_exports: + $(_v)${MAKE} \ + MAKEFILES=${SOURCE}/config/Makefile \ + SOURCE=${SOURCE}/config \ + TARGET=$${TARGET} \ + build_mach_kernel_exports; + # Special rules to install machine configuration variants $(DSTROOT)$(INSTALL_FILE_DIR)mach.$(KERNEL_CONFIG_LC).$(MACHINE_CONFIG_LC): $(TARGET)/mach_kernel force_file_install diff --git a/osfmk/conf/MASTER b/osfmk/conf/MASTER index 76e39eb65..cadb1a976 100644 --- a/osfmk/conf/MASTER +++ b/osfmk/conf/MASTER @@ -230,6 +230,10 @@ options CONFIG_EMBEDDED # # options CONFIG_ENFORCE_SIGNED_CODE # +# support dynamic signing of code +# +options CONFIG_DYNAMIC_CODE_SIGNING # + # vc_progress_white - make the progress gear white instead of black options CONFIG_VC_PROGRESS_WHITE # diff --git a/osfmk/console/panic_dialog.c b/osfmk/console/panic_dialog.c index 29fa14890..ca0acb6d1 100644 --- a/osfmk/console/panic_dialog.c +++ b/osfmk/console/panic_dialog.c @@ -35,6 +35,7 @@ #include #include #include +#include extern struct vc_info vinfo; extern boolean_t panicDialogDesired; @@ -51,7 +52,6 @@ static int panic_dialog_verify( const struct panicimage * data, unsigned int siz static int pixels_needed_to_blit_digit( int digit ); static void blit_digit( int digit ); static const char * strnstr(const char * s, const char * find, size_t slen); -void dim_screen(void); static void panic_blit_rect(unsigned int x, unsigned int y, unsigned int width, unsigned int height, int transparent, const unsigned char * dataPtr); @@ -839,40 +839,6 @@ decode_rle(const unsigned char *dataPtr, unsigned int *quantity, } -void -dim_screen(void) -{ - unsigned int *p, *endp, *row; - int col, rowline, rowlongs; - register unsigned int mask; - - if(!vinfo.v_depth) - return; - - if ( vinfo.v_depth == 32 ) - mask = 0x007F7F7F; - else if ( vinfo.v_depth == 30 ) - mask = (0x1ff<<20) | (0x1ff<<10) | 0x1ff; - else if ( vinfo.v_depth == 16 ) - mask = 0x3DEF3DEF; - else - return; - - rowline = 
(int)(vinfo.v_rowscanbytes / 4); - rowlongs = (int)(vinfo.v_rowbytes / 4); - - p = (unsigned int*) vinfo.v_baseaddr; - endp = p + (rowlongs * vinfo.v_height); - - for (row = p ; row < endp ; row += rowlongs) { - for (p = &row[0], col = 0; col < rowline; col++) { - *p = (*p >> 1) & mask; - ++p; - } - } -} - - /* From user mode Libc - this ought to be in a library */ static const char * strnstr(const char * s, const char * find, size_t slen) diff --git a/osfmk/console/video_console.c b/osfmk/console/video_console.c index 49dc6da91..8c0dc3bf2 100644 --- a/osfmk/console/video_console.c +++ b/osfmk/console/video_console.c @@ -2506,6 +2506,39 @@ initialize_screen(PE_Video * boot_vinfo, unsigned int op) #endif /* GRATEFULDEBUGGER */ } +void +dim_screen(void) +{ + unsigned int *p, *endp, *row; + int col, rowline, rowlongs; + register unsigned int mask; + + if(!vinfo.v_depth) + return; + + if ( vinfo.v_depth == 32 ) + mask = 0x007F7F7F; + else if ( vinfo.v_depth == 30 ) + mask = (0x1ff<<20) | (0x1ff<<10) | 0x1ff; + else if ( vinfo.v_depth == 16 ) + mask = 0x3DEF3DEF; + else + return; + + rowline = (int)(vinfo.v_rowscanbytes / 4); + rowlongs = (int)(vinfo.v_rowbytes / 4); + + p = (unsigned int*) vinfo.v_baseaddr; + endp = p + (rowlongs * vinfo.v_height); + + for (row = p ; row < endp ; row += rowlongs) { + for (p = &row[0], col = 0; col < rowline; col++) { + *p = (*p >> 1) & mask; + ++p; + } + } +} + void vcattach(void); /* XXX gcc 4 warning cleanup */ void diff --git a/osfmk/i386/AT386/model_dep.c b/osfmk/i386/AT386/model_dep.c index 204a85ab6..c623ba72e 100644 --- a/osfmk/i386/AT386/model_dep.c +++ b/osfmk/i386/AT386/model_dep.c @@ -155,7 +155,7 @@ typedef struct _cframe_t { static unsigned panic_io_port; static unsigned commit_paniclog_to_nvram; -int debug_boot_arg; +unsigned int debug_boot_arg; void machine_startup(void) @@ -167,13 +167,14 @@ machine_startup(void) halt_in_debugger = halt_in_debugger ? 
0 : 1; #endif - if (PE_parse_boot_argn("debug", &boot_arg, sizeof (boot_arg))) { - if (boot_arg & DB_HALT) halt_in_debugger=1; - if (boot_arg & DB_PRT) disable_debug_output=FALSE; - if (boot_arg & DB_SLOG) systemLogDiags=TRUE; - if (boot_arg & DB_NMI) panicDebugging=TRUE; - if (boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE; - debug_boot_arg = boot_arg; + if (PE_parse_boot_argn("debug", &debug_boot_arg, sizeof (debug_boot_arg))) { + if (debug_boot_arg & DB_HALT) halt_in_debugger=1; + if (debug_boot_arg & DB_PRT) disable_debug_output=FALSE; + if (debug_boot_arg & DB_SLOG) systemLogDiags=TRUE; + if (debug_boot_arg & DB_NMI) panicDebugging=TRUE; + if (debug_boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE; + } else { + debug_boot_arg = 0; } if (!PE_parse_boot_argn("nvram_paniclog", &commit_paniclog_to_nvram, sizeof (commit_paniclog_to_nvram))) @@ -714,13 +715,11 @@ panic_io_port_read(void) { /* For use with the MP rendezvous mechanism */ -#if !CONFIG_EMBEDDED static void machine_halt_cpu(__unused void *arg) { panic_io_port_read(); pmCPUHalt(PM_HALT_DEBUG); } -#endif void Debugger( @@ -762,7 +761,7 @@ Debugger( #endif /* Print backtrace - callee is internally synchronized */ - panic_i386_backtrace(stackptr, 20, NULL, FALSE, NULL); + panic_i386_backtrace(stackptr, 32, NULL, FALSE, NULL); /* everything should be printed now so copy to NVRAM */ @@ -819,23 +818,28 @@ Debugger( } } } - draw_panic_dialog(); + + /* If the user won't be able to read the dialog, + * don't bother trying to show it + */ + if (!PE_reboot_on_panic()) + draw_panic_dialog(); if (!panicDebugging) { /* Clear the MP rendezvous function lock, in the event * that a panic occurred while in that codepath. */ mp_rendezvous_break_lock(); -#if CONFIG_EMBEDDED - PEHaltRestart(kPEPanicRestartCPU); -#else + if (PE_reboot_on_panic()) { + PEHaltRestart(kPEPanicRestartCPU); + } + /* Force all CPUs to disable interrupts and HLT. 
* We've panicked, and shouldn't depend on the * PEHaltRestart() mechanism, which relies on several * bits of infrastructure. */ mp_rendezvous_no_intrs(machine_halt_cpu, NULL); -#endif /* NOT REACHED */ } } diff --git a/osfmk/i386/cpu_capabilities.h b/osfmk/i386/cpu_capabilities.h index 470e8a3e7..58791ecb8 100644 --- a/osfmk/i386/cpu_capabilities.h +++ b/osfmk/i386/cpu_capabilities.h @@ -52,7 +52,6 @@ #define k64Bit 0x00000200 /* processor supports EM64T (not what mode you're running in) */ #define kHasSSE4_1 0x00000400 #define kHasSSE4_2 0x00000800 -#define kHasAES 0x00001000 #define kInOrderPipeline 0x00002000 /* in-order execution */ #define kSlow 0x00004000 /* tsc < nanosecond */ #define kUP 0x00008000 /* set if (kNumCPUs == 1) */ diff --git a/osfmk/i386/cpuid.c b/osfmk/i386/cpuid.c index c247a157d..1ddb1469e 100644 --- a/osfmk/i386/cpuid.c +++ b/osfmk/i386/cpuid.c @@ -573,6 +573,7 @@ cpuid_set_generic_info(i386_cpu_info_t *info_p) cpuid_fn(6, reg); ctp->sensor = bitfield32(reg[eax], 0, 0); ctp->dynamic_acceleration = bitfield32(reg[eax], 1, 1); + ctp->invariant_APIC_timer = bitfield32(reg[eax], 2, 2); ctp->thresholds = bitfield32(reg[ebx], 3, 0); ctp->ACNT_MCNT = bitfield32(reg[ecx], 0, 0); info_p->cpuid_thermal_leafp = ctp; @@ -727,9 +728,9 @@ static struct { extfeature_map[] = { {CPUID_EXTFEATURE_SYSCALL, "SYSCALL"}, {CPUID_EXTFEATURE_XD, "XD"}, + {CPUID_EXTFEATURE_RDTSCP, "RDTSCP"}, {CPUID_EXTFEATURE_EM64T, "EM64T"}, {CPUID_EXTFEATURE_LAHF, "LAHF"}, - {CPUID_EXTFEATURE_RDTSCP, "RDTSCP"}, {CPUID_EXTFEATURE_TSCI, "TSCI"}, {0, 0} }; diff --git a/osfmk/i386/cpuid.h b/osfmk/i386/cpuid.h index 135ededc3..32b07e12a 100644 --- a/osfmk/i386/cpuid.h +++ b/osfmk/i386/cpuid.h @@ -84,6 +84,7 @@ #define CPUID_FEATURE_PBE _Bit(31) /* Pend Break Enable */ #define CPUID_FEATURE_SSE3 _HBit(0) /* Streaming SIMD extensions 3 */ + #define CPUID_FEATURE_MONITOR _HBit(3) /* Monitor/mwait */ #define CPUID_FEATURE_DSCPL _HBit(4) /* Debug Store CPL */ #define CPUID_FEATURE_VMX 
_HBit(5) /* VMX */ @@ -95,6 +96,7 @@ #define CPUID_FEATURE_CX16 _HBit(13) /* CmpXchg16b instruction */ #define CPUID_FEATURE_xTPR _HBit(14) /* Send Task PRiority msgs */ #define CPUID_FEATURE_PDCM _HBit(15) /* Perf/Debug Capability MSR */ + #define CPUID_FEATURE_DCA _HBit(18) /* Direct Cache Access */ #define CPUID_FEATURE_SSE4_1 _HBit(19) /* Streaming SIMD extensions 4.1 */ #define CPUID_FEATURE_SSE4_2 _HBit(20) /* Streaming SIMD extensions 4.2 */ @@ -108,10 +110,11 @@ */ #define CPUID_EXTFEATURE_SYSCALL _Bit(11) /* SYSCALL/sysret */ #define CPUID_EXTFEATURE_XD _Bit(20) /* eXecute Disable */ + #define CPUID_EXTFEATURE_RDTSCP _Bit(27) /* RDTSCP */ #define CPUID_EXTFEATURE_EM64T _Bit(29) /* Extended Mem 64 Technology */ -#define CPUID_EXTFEATURE_LAHF _HBit(20) /* LAFH/SAHF instructions */ +#define CPUID_EXTFEATURE_LAHF _HBit(0) /* LAHF/SAHF instructions */ /* * The CPUID_EXTFEATURE_XXX values define 64-bit values @@ -128,7 +131,6 @@ #define CPUID_MODEL_MEROM 15 #define CPUID_MODEL_PENRYN 23 #define CPUID_MODEL_NEHALEM 26 -#define CPUID_MODEL_ATOM 28 #define CPUID_MODEL_FIELDS 30 /* Lynnfield, Clarksfield, Jasper */ #define CPUID_MODEL_DALES 31 /* Havendale, Auburndale */ #define CPUID_MODEL_NEHALEM_EX 46 @@ -200,6 +202,7 @@ typedef struct { typedef struct { boolean_t sensor; boolean_t dynamic_acceleration; + boolean_t invariant_APIC_timer; uint32_t thresholds; boolean_t ACNT_MCNT; } cpuid_thermal_leaf_t; diff --git a/osfmk/i386/lapic.c b/osfmk/i386/lapic.c index 0206d0986..21e974bff 100644 --- a/osfmk/i386/lapic.c +++ b/osfmk/i386/lapic.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -90,12 +91,11 @@ static unsigned lapic_master_error_count = 0; static unsigned lapic_error_count_threshold = 5; static boolean_t lapic_dont_panic = FALSE; -extern int debug_boot_arg; - /* Base vector for local APIC interrupt sources */ int lapic_interrupt_base = LAPIC_DEFAULT_INTERRUPT_BASE; -int lapic_to_cpu[MAX_CPUS]; +#define MAX_LAPICIDS 
(LAPIC_ID_MAX+1) +int lapic_to_cpu[MAX_LAPICIDS]; int cpu_to_lapic[MAX_CPUS]; static void @@ -103,15 +103,17 @@ lapic_cpu_map_init(void) { int i; - for (i = 0; i < MAX_CPUS; i++) { - lapic_to_cpu[i] = -1; + for (i = 0; i < MAX_CPUS; i++) cpu_to_lapic[i] = -1; - } + for (i = 0; i < MAX_LAPICIDS; i++) + lapic_to_cpu[i] = -1; } void lapic_cpu_map(int apic_id, int cpu) { + assert(apic_id < MAX_LAPICIDS); + assert(cpu < MAX_CPUS); cpu_to_lapic[cpu] = apic_id; lapic_to_cpu[apic_id] = cpu; } @@ -137,7 +139,7 @@ ml_get_apicid(uint32_t cpu) uint32_t ml_get_cpuid(uint32_t lapic_index) { - if(lapic_index >= (uint32_t)MAX_CPUS) + if(lapic_index >= (uint32_t)MAX_LAPICIDS) return 0xFFFFFFFF; /* Return -1 if cpu too big */ /* Return the cpu ID (or -1 if not configured) */ @@ -158,7 +160,7 @@ lapic_cpu_map_dump(void) kprintf("cpu_to_lapic[%d]: %d\n", i, cpu_to_lapic[i]); } - for (i = 0; i < MAX_CPUS; i++) { + for (i = 0; i < MAX_LAPICIDS; i++) { if (lapic_to_cpu[i] == -1) continue; kprintf("lapic_to_cpu[%d]: %d\n", diff --git a/osfmk/i386/loose_ends.c b/osfmk/i386/loose_ends.c index 6df816c55..27f1fb52b 100644 --- a/osfmk/i386/loose_ends.c +++ b/osfmk/i386/loose_ends.c @@ -786,10 +786,6 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, } pmap = thread->map->pmap; -#if CONFIG_DTRACE - thread->machine.specFlags |= CopyIOActive; -#endif /* CONFIG_DTRACE */ - if (pmap == kernel_pmap || use_kernel_map) { kern_vaddr = (vm_offset_t)user_addr; @@ -819,13 +815,18 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)kern_vaddr, (unsigned)kernel_addr, (unsigned)nbytes, error | 0x80000000, 0); + return (error); + } #if CONFIG_DTRACE - thread->machine.specFlags &= ~CopyIOActive; + thread->machine.specFlags |= CopyIOActive; #endif /* CONFIG_DTRACE */ - return (error); + if ((nbytes && (user_addr + nbytes <= user_addr)) || ((user_addr + nbytes) > vm_map_max(thread->map))) { + error = EFAULT; + goto done; } + 
user_base = user_addr & ~((user_addr_t)(NBPDE - 1)); user_offset = (vm_offset_t)(user_addr & (NBPDE - 1)); @@ -1029,6 +1030,8 @@ copyio_phys(addr64_t source, addr64_t sink, vm_size_t csize, int which) } window_offset = (char *)((uint32_t)paddr & (PAGE_SIZE - 1)); + assert(!((current_thread()->machine.specFlags & CopyIOActive) && ((which & cppvKmap) == 0))); + if (current_thread()->machine.physwindow_busy) { pt_entry_t old_pentry; diff --git a/osfmk/i386/pmCPU.c b/osfmk/i386/pmCPU.c index 56fe44b17..0efbb917c 100644 --- a/osfmk/i386/pmCPU.c +++ b/osfmk/i386/pmCPU.c @@ -113,8 +113,8 @@ machine_idle(void) if (pmInitDone && pmDispatch != NULL - && pmDispatch->cstateMachineIdle != NULL) - (*pmDispatch->cstateMachineIdle)(0x7FFFFFFFFFFFFFFFULL); + && pmDispatch->MachineIdle != NULL) + (*pmDispatch->MachineIdle)(0x7FFFFFFFFFFFFFFFULL); else { /* * If no power management, re-enable interrupts and halt. @@ -562,8 +562,10 @@ machine_run_count(uint32_t count) } boolean_t -machine_cpu_is_inactive(int cpu) +machine_processor_is_inactive(processor_t processor) { + int cpu = processor->cpu_id; + if (pmDispatch != NULL && pmDispatch->pmIsCPUUnAvailable != NULL) return(pmDispatch->pmIsCPUUnAvailable(cpu_to_lcpu(cpu))); @@ -571,6 +573,43 @@ machine_cpu_is_inactive(int cpu) return(FALSE); } +processor_t +machine_choose_processor(processor_set_t pset, + processor_t preferred) +{ + int startCPU; + int endCPU; + int preferredCPU; + int chosenCPU; + + if (!pmInitDone) + return(preferred); + + if (pset == NULL) { + startCPU = -1; + endCPU = -1; + } else { + startCPU = pset->cpu_set_low; + endCPU = pset->cpu_set_hi; + } + + if (preferred == NULL) + preferredCPU = -1; + else + preferredCPU = preferred->cpu_id; + + if (pmDispatch != NULL + && pmDispatch->pmChooseCPU != NULL) { + chosenCPU = pmDispatch->pmChooseCPU(startCPU, endCPU, preferredCPU); + + if (chosenCPU == -1) + return(NULL); + return(cpu_datap(chosenCPU)->cpu_processor); + } + + return(preferred); +} + static uint32_t 
pmGetSavedRunCount(void) { diff --git a/osfmk/i386/pmCPU.h b/osfmk/i386/pmCPU.h index 5609df50e..ff67de670 100644 --- a/osfmk/i386/pmCPU.h +++ b/osfmk/i386/pmCPU.h @@ -38,7 +38,7 @@ * This value should be changed each time that pmDsipatch_t or pmCallBacks_t * changes. */ -#define PM_DISPATCH_VERSION 18 +#define PM_DISPATCH_VERSION 19 /* * Dispatch table for functions that get installed when the power @@ -54,7 +54,7 @@ typedef struct { int (*pmCPUStateInit)(void); void (*cstateInit)(void); - uint64_t (*cstateMachineIdle)(uint64_t maxIdleDuration); + uint64_t (*MachineIdle)(uint64_t maxIdleDuration); uint64_t (*GetDeadline)(x86_lcpu_t *lcpu); uint64_t (*SetDeadline)(x86_lcpu_t *lcpu, uint64_t); void (*Deadline)(x86_lcpu_t *lcpu); @@ -75,6 +75,7 @@ typedef struct void (*markAllCPUsOff)(void); void (*pmSetRunCount)(uint32_t count); boolean_t (*pmIsCPUUnAvailable)(x86_lcpu_t *lcpu); + int (*pmChooseCPU)(int startCPU, int endCPU, int preferredCPU); int (*pmIPIHandler)(void *state); } pmDispatch_t; diff --git a/osfmk/i386/pmap.c b/osfmk/i386/pmap.c index 311763f1f..e7135803a 100644 --- a/osfmk/i386/pmap.c +++ b/osfmk/i386/pmap.c @@ -89,7 +89,6 @@ */ #include -#include #include #include @@ -219,143 +218,10 @@ boolean_t pmap_trace = FALSE; uint64_t max_preemption_latency_tsc = 0; -/* - * Private data structures. - */ - -/* - * For each vm_page_t, there is a list of all currently - * valid virtual mappings of that page. An entry is - * a pv_rooted_entry_t; the list is the pv_table. - * - * N.B. with the new combo rooted/hashed scheme it is - * only possibly to remove individual non-rooted entries - * if they are found via the hashed chains as there is no - * way to unlink the singly linked hashed entries if navigated to - * via the queue list off the rooted entries. Think of it as - * hash/walk/pull, keeping track of the prev pointer while walking - * the singly linked hash list. All of this is to save memory and - * keep both types of pv_entries as small as possible. 
- */ - -/* - -PV HASHING Changes - JK 1/2007 - -Pve's establish physical to virtual mappings. These are used for aliasing of a -physical page to (potentially many) virtual addresses within pmaps. In the previous -implementation the structure of the pv_entries (each 16 bytes in size) was - -typedef struct pv_entry { - struct pv_entry_t next; - pmap_t pmap; - vm_map_offset_t va; -} *pv_entry_t; - -An initial array of these is created at boot time, one per physical page of memory, -indexed by the physical page number. Additionally, a pool of entries is created from a -pv_zone to be used as needed by pmap_enter() when it is creating new mappings. -Originally, we kept this pool around because the code in pmap_enter() was unable to -block if it needed an entry and none were available - we'd panic. Some time ago I -restructured the pmap_enter() code so that for user pmaps it can block while zalloc'ing -a pv structure and restart, removing a panic from the code (in the case of the kernel -pmap we cannot block and still panic, so, we keep a separate hot pool for use only on -kernel pmaps). The pool has not been removed since there is a large performance gain -keeping freed pv's around for reuse and not suffering the overhead of zalloc for every new pv we need. - -As pmap_enter() created new mappings it linked the new pve's for them off the fixed -pv array for that ppn (off the next pointer). These pve's are accessed for several -operations, one of them being address space teardown. In that case, we basically do this - - for (every page/pte in the space) { - calc pve_ptr from the ppn in the pte - for (every pv in the list for the ppn) { - if (this pv is for this pmap/vaddr) { - do housekeeping - unlink/free the pv - } - } - } - -The problem arose when we were running, say 8000 (or even 2000) apache or other processes -and one or all terminate. The list hanging off each pv array entry could have thousands of -entries. 
We were continuously linearly searching each of these lists as we stepped through -the address space we were tearing down. Because of the locks we hold, likely taking a cache -miss for each node, and interrupt disabling for MP issues the system became completely -unresponsive for many seconds while we did this. - -Realizing that pve's are accessed in two distinct ways (linearly running the list by ppn -for operations like pmap_page_protect and finding and modifying/removing a single pve as -part of pmap_enter processing) has led to modifying the pve structures and databases. - -There are now two types of pve structures. A "rooted" structure which is basically the -original structure accessed in an array by ppn, and a ''hashed'' structure accessed on a -hash list via a hash of [pmap, vaddr]. These have been designed with the two goals of -minimizing wired memory and making the lookup of a ppn faster. Since a vast majority of -pages in the system are not aliased and hence represented by a single pv entry I've kept -the rooted entry size as small as possible because there is one of these dedicated for -every physical page of memory. The hashed pve's are larger due to the addition of the hash -link and the ppn entry needed for matching while running the hash list to find the entry we -are looking for. This way, only systems that have lots of aliasing (like 2000+ httpd procs) -will pay the extra memory price. Both structures have the same first three fields allowing -some simplification in the code. - -They have these shapes - -typedef struct pv_rooted_entry { - queue_head_t qlink; - vm_map_offset_t va; - pmap_t pmap; -} *pv_rooted_entry_t; - - -typedef struct pv_hashed_entry { - queue_head_t qlink; - vm_map_offset_t va; - pmap_t pmap; - ppnum_t ppn; - struct pv_hashed_entry *nexth; -} *pv_hashed_entry_t; - -The main flow difference is that the code is now aware of the rooted entry and the hashed -entries. 
Code that runs the pv list still starts with the rooted entry and then continues -down the qlink onto the hashed entries. Code that is looking up a specific pv entry first -checks the rooted entry and then hashes and runs the hash list for the match. The hash list -lengths are much smaller than the original pv lists that contained all aliases for the specific ppn. - -*/ - -typedef struct pv_rooted_entry { /* first three entries must match pv_hashed_entry_t */ - queue_head_t qlink; - vm_map_offset_t va; /* virtual address for mapping */ - pmap_t pmap; /* pmap where mapping lies */ -} *pv_rooted_entry_t; - -#define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0) - -pv_rooted_entry_t pv_head_table; /* array of entries, one per page */ - -typedef struct pv_hashed_entry { /* first three entries must match pv_rooted_entry_t */ - queue_head_t qlink; - vm_map_offset_t va; - pmap_t pmap; - ppnum_t ppn; - struct pv_hashed_entry *nexth; -} *pv_hashed_entry_t; - -#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0) - -#define NPVHASH 4095 /* MUST BE 2^N - 1 */ pv_hashed_entry_t *pv_hash_table; /* hash lists */ uint32_t npvhash = 0; -/* #define PV_DEBUG 1 uncomment to enable some PV debugging code */ -#ifdef PV_DEBUG -#define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized"); -#else -#define CHK_NPVHASH() -#endif /* * pv_list entries are kept on a list that can only be accessed @@ -373,53 +239,6 @@ int pv_free_count = 0; int pv_hashed_free_count = 0; int pv_kern_free_count = 0; int pv_hashed_kern_free_count = 0; -#define PV_HASHED_LOW_WATER_MARK 5000 -#define PV_HASHED_KERN_LOW_WATER_MARK 100 -#define PV_HASHED_ALLOC_CHUNK 2000 -#define PV_HASHED_KERN_ALLOC_CHUNK 50 -thread_call_t mapping_adjust_call; -static thread_call_data_t mapping_adjust_call_data; -uint32_t mappingrecurse = 0; - -#define PV_HASHED_ALLOC(pvh_e) { \ - simple_lock(&pv_hashed_free_list_lock); \ - if ((pvh_e = pv_hashed_free_list) != 0) { \ - pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; 
\ - pv_hashed_free_count--; \ - if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \ - if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \ - thread_call_enter(mapping_adjust_call); \ - } \ - simple_unlock(&pv_hashed_free_list_lock); \ -} - -#define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \ - simple_lock(&pv_hashed_free_list_lock); \ - pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; \ - pv_hashed_free_list = pvh_eh; \ - pv_hashed_free_count += pv_cnt; \ - simple_unlock(&pv_hashed_free_list_lock); \ -} - -#define PV_HASHED_KERN_ALLOC(pvh_e) { \ - simple_lock(&pv_hashed_kern_free_list_lock); \ - if ((pvh_e = pv_hashed_kern_free_list) != 0) { \ - pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \ - pv_hashed_kern_free_count--; \ - if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) \ - if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \ - thread_call_enter(mapping_adjust_call); \ - } \ - simple_unlock(&pv_hashed_kern_free_list_lock); \ -} - -#define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \ - simple_lock(&pv_hashed_kern_free_list_lock); \ - pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; \ - pv_hashed_kern_free_list = pvh_eh; \ - pv_hashed_kern_free_count += pv_cnt; \ - simple_unlock(&pv_hashed_kern_free_list_lock); \ -} zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */ @@ -447,23 +266,6 @@ boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? 
*/ static struct vm_object kptobj_object_store; static vm_object_t kptobj; -/* - * Index into pv_head table, its lock bits, and the modify/reference and managed bits - */ - -#define pa_index(pa) (i386_btop(pa)) -#define ppn_to_pai(ppn) ((int)ppn) - -#define pai_to_pvh(pai) (&pv_head_table[pai]) -#define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table) -#define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table) - -#define pvhashidx(pmap, va) (((uint32_t)pmap ^ ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) & npvhash) -#define pvhash(idx) (&pv_hash_table[idx]) - -#define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table) -#define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table) - /* * Array of physical page attribites for managed pages. * One byte per physical page. @@ -596,44 +398,6 @@ static int nkpt; pt_entry_t *DMAP1, *DMAP2; caddr_t DADDR1; caddr_t DADDR2; - -static inline -void pmap_pvh_unlink(pv_hashed_entry_t pv); - -/* - * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain. - * properly deals with the anchor. - * must be called with the hash locked, does not unlock it - */ - -static inline -void pmap_pvh_unlink(pv_hashed_entry_t pvh) -{ - pv_hashed_entry_t curh; - pv_hashed_entry_t *pprevh; - int pvhash_idx; - - CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh->pmap, pvh->va); - - pprevh = pvhash(pvhash_idx); - -#if PV_DEBUG - if (NULL == *pprevh) panic("pvh_unlink null anchor"); /* JK DEBUG */ -#endif - curh = *pprevh; - - while (PV_HASHED_ENTRY_NULL != curh) { - if (pvh == curh) - break; - pprevh = &curh->nexth; - curh = curh->nexth; - } - if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh"); - *pprevh = pvh->nexth; - return; -} - /* * for legacy, returns the address of the pde entry. * for 64 bit, causes the pdpt page containing the pde entry to be mapped, @@ -1550,7 +1314,7 @@ pmap_create( va = (vm_offset_t)p->dirbase; p->pdirbase = kvtophys(va); - template = cpu_64bit ? 
INTEL_PTE_VALID|INTEL_PTE_RW|INTEL_PTE_USER|INTEL_PTE_REF : INTEL_PTE_VALID; + template = INTEL_PTE_VALID; for (i = 0; i< NPGPTD; i++, pdpt++ ) { pmap_paddr_t pa; pa = (pmap_paddr_t) kvtophys((vm_offset_t)(va + i386_ptob(i))); @@ -1588,7 +1352,7 @@ pmap_create( /* uber space points to uber mapped kernel */ s = splhigh(); pml4p = pmap64_pml4(p, 0ULL); - pmap_store_pte((pml4p+KERNEL_UBER_PML4_INDEX),*kernel_pmap->pm_pml4); + pmap_store_pte((pml4p+KERNEL_UBER_PML4_INDEX), *kernel_pmap->pm_pml4); if (!is_64bit) { @@ -1815,231 +1579,6 @@ pmap_reference( } } -/* - * Remove a range of hardware page-table entries. - * The entries given are the first (inclusive) - * and last (exclusive) entries for the VM pages. - * The virtual address is the va for the first pte. - * - * The pmap must be locked. - * If the pmap is not the kernel pmap, the range must lie - * entirely within one pte-page. This is NOT checked. - * Assumes that the pte-page exists. - */ - -void -pmap_remove_range( - pmap_t pmap, - vm_map_offset_t start_vaddr, - pt_entry_t *spte, - pt_entry_t *epte) -{ - register pt_entry_t *cpte; - pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL; - pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL; - pv_hashed_entry_t pvh_e; - int pvh_cnt = 0; - int num_removed, num_unwired, num_found; - int pai; - pmap_paddr_t pa; - vm_map_offset_t vaddr; - int pvhash_idx; - uint32_t pv_cnt; - - num_removed = 0; - num_unwired = 0; - num_found = 0; - - if (pmap != kernel_pmap && - pmap->pm_task_map == TASK_MAP_32BIT && - start_vaddr >= HIGH_MEM_BASE) { - /* - * The range is in the "high_shared_pde" which is shared - * between the kernel and all 32-bit tasks. It holds - * the 32-bit commpage but also the trampolines, GDT, etc... - * so we can't let user tasks remove anything from it. 
- */ - return; - } - - /* invalidate the PTEs first to "freeze" them */ - for (cpte = spte, vaddr = start_vaddr; - cpte < epte; - cpte++, vaddr += PAGE_SIZE_64) { - - pa = pte_to_pa(*cpte); - if (pa == 0) - continue; - num_found++; - - if (iswired(*cpte)) - num_unwired++; - - pai = pa_index(pa); - - if (!managed_page(pai)) { - /* - * Outside range of managed physical memory. - * Just remove the mappings. - */ - pmap_store_pte(cpte, 0); - continue; - } - - /* invalidate the PTE */ - pmap_update_pte(cpte, *cpte, (*cpte & ~INTEL_PTE_VALID)); - } - - if (num_found == 0) { - /* nothing was changed: we're done */ - goto update_counts; - } - - /* propagate the invalidates to other CPUs */ - - PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr); - - for (cpte = spte, vaddr = start_vaddr; - cpte < epte; - cpte++, vaddr += PAGE_SIZE_64) { - - pa = pte_to_pa(*cpte); - if (pa == 0) - continue; - - pai = pa_index(pa); - - LOCK_PVH(pai); - - pa = pte_to_pa(*cpte); - if (pa == 0) { - UNLOCK_PVH(pai); - continue; - } - - num_removed++; - - /* - * Get the modify and reference bits, then - * nuke the entry in the page table - */ - /* remember reference and change */ - pmap_phys_attributes[pai] |= - (char)(*cpte & (PHYS_MODIFIED | PHYS_REFERENCED)); - /* completely invalidate the PTE */ - pmap_store_pte(cpte, 0); - - /* - * Remove the mapping from the pvlist for - * this physical page. - */ - { - pv_rooted_entry_t pv_h; - pv_hashed_entry_t *pprevh; - ppnum_t ppn = (ppnum_t)pai; - - pv_h = pai_to_pvh(pai); - pvh_e = PV_HASHED_ENTRY_NULL; - if (pv_h->pmap == PMAP_NULL) - panic("pmap_remove_range: null pv_list!"); - - if (pv_h->va == vaddr && pv_h->pmap == pmap) { /* rooted or not */ - /* - * Header is the pv_rooted_entry. We can't free that. If there is a queued - * entry after this one we remove that - * from the ppn queue, we remove it from the hash chain - * and copy it to the rooted entry. Then free it instead. 
- */ - - pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink); - if (pv_h != (pv_rooted_entry_t)pvh_e) { /* any queued after rooted? */ - CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va); - LOCK_PV_HASH(pvhash_idx); - remque(&pvh_e->qlink); - { - pprevh = pvhash(pvhash_idx); - if (PV_HASHED_ENTRY_NULL == *pprevh) { - panic("pmap_remove_range empty hash removing rooted pv"); - } - } - pmap_pvh_unlink(pvh_e); - UNLOCK_PV_HASH(pvhash_idx); - pv_h->pmap = pvh_e->pmap; - pv_h->va = pvh_e->va; /* dispose of pvh_e */ - } else { /* none queued after rooted */ - pv_h->pmap = PMAP_NULL; - pvh_e = PV_HASHED_ENTRY_NULL; - } /* any queued after rooted */ - - } else { /* rooted or not */ - /* not removing rooted pv. find it on hash chain, remove from ppn queue and - * hash chain and free it */ - CHK_NPVHASH(); - pvhash_idx = pvhashidx(pmap,vaddr); - LOCK_PV_HASH(pvhash_idx); - pprevh = pvhash(pvhash_idx); - if (PV_HASHED_ENTRY_NULL == *pprevh) { - panic("pmap_remove_range empty hash removing hashed pv"); - } - pvh_e = *pprevh; - pmap_pv_hashlist_walks++; - pv_cnt = 0; - while (PV_HASHED_ENTRY_NULL != pvh_e) { - pv_cnt++; - if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == ppn) break; - pprevh = &pvh_e->nexth; - pvh_e = pvh_e->nexth; - } - pmap_pv_hashlist_cnts += pv_cnt; - if (pmap_pv_hashlist_max < pv_cnt) pmap_pv_hashlist_max = pv_cnt; - if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pmap_remove_range pv not on hash"); - *pprevh = pvh_e->nexth; - remque(&pvh_e->qlink); - UNLOCK_PV_HASH(pvhash_idx); - - } /* rooted or not */ - - UNLOCK_PVH(pai); - - if (pvh_e != PV_HASHED_ENTRY_NULL) { - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; - - if (pvh_et == PV_HASHED_ENTRY_NULL) { - pvh_et = pvh_e; - } - - pvh_cnt++; - } - - } /* removing mappings for this phy page */ - } /* for loop */ - - if (pvh_eh != PV_HASHED_ENTRY_NULL) { - PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt); - } - -update_counts: - /* - * Update the counts - */ -#if TESTING - if 
(pmap->stats.resident_count < num_removed) - panic("pmap_remove_range: resident_count"); -#endif - assert(pmap->stats.resident_count >= num_removed); - OSAddAtomic(-num_removed, &pmap->stats.resident_count); - -#if TESTING - if (pmap->stats.wired_count < num_unwired) - panic("pmap_remove_range: wired_count"); -#endif - assert(pmap->stats.wired_count >= num_unwired); - OSAddAtomic(-num_unwired, &pmap->stats.wired_count); - - return; -} /* * Remove phys addr if mapped in specified map @@ -2055,290 +1594,6 @@ pmap_remove_some_phys( } -/* - * Remove the given range of addresses - * from the specified map. - * - * It is assumed that the start and end are properly - * rounded to the hardware page size. - */ - - -void -pmap_remove( - pmap_t map, - addr64_t s64, - addr64_t e64) -{ - pt_entry_t *pde; - pt_entry_t *spte, *epte; - addr64_t l64; - addr64_t orig_s64; - uint64_t deadline; - - pmap_intr_assert(); - - if (map == PMAP_NULL || s64 == e64) - return; - - PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START, - (int) map, - (int) (s64>>32), (int) s64, - (int) (e64>>32), (int) e64); - - PMAP_LOCK(map); - -#if 0 - /* - * Check that address range in the kernel does not overlap the stacks. - * We initialize local static min/max variables once to avoid making - * 2 function calls for every remove. Note also that these functions - * both return 0 before kernel stacks have been initialized, and hence - * the panic is not triggered in this case. 
- */ - if (map == kernel_pmap) { - static vm_offset_t kernel_stack_min = 0; - static vm_offset_t kernel_stack_max = 0; - - if (kernel_stack_min == 0) { - kernel_stack_min = min_valid_stack_address(); - kernel_stack_max = max_valid_stack_address(); - } - if ((kernel_stack_min <= s64 && s64 < kernel_stack_max) || - (kernel_stack_min < e64 && e64 <= kernel_stack_max)) - panic("pmap_remove() attempted in kernel stack"); - } -#else - - /* - * The values of kernel_stack_min and kernel_stack_max are no longer - * relevant now that we allocate kernel stacks anywhere in the kernel map, - * so the old code above no longer applies. If we wanted to check that - * we weren't removing a mapping of a page in a kernel stack we'd have to - * mark the PTE with an unused bit and check that here. - */ - -#endif - - deadline = rdtsc64() + max_preemption_latency_tsc; - - orig_s64 = s64; - - while (s64 < e64) { - l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size-1); - if (l64 > e64) - l64 = e64; - pde = pmap_pde(map, s64); - - if (pde && (*pde & INTEL_PTE_VALID)) { - spte = (pt_entry_t *)pmap_pte(map, (s64 & ~(pde_mapped_size-1))); - spte = &spte[ptenum(s64)]; - epte = &spte[intel_btop(l64-s64)]; - - pmap_remove_range(map, s64, spte, epte); - } - s64 = l64; - pde++; - - if (s64 < e64 && rdtsc64() >= deadline) { - PMAP_UNLOCK(map) - PMAP_LOCK(map) - - deadline = rdtsc64() + max_preemption_latency_tsc; - } - - } - - PMAP_UNLOCK(map); - - PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END, - (int) map, 0, 0, 0, 0); - -} - -/* - * Routine: pmap_page_protect - * - * Function: - * Lower the permission for all mappings to a given - * page. 
- */ -void -pmap_page_protect( - ppnum_t pn, - vm_prot_t prot) -{ - pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL; - pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL; - pv_hashed_entry_t nexth; - int pvh_cnt = 0; - pv_rooted_entry_t pv_h; - pv_rooted_entry_t pv_e; - pv_hashed_entry_t pvh_e; - pt_entry_t *pte; - int pai; - register pmap_t pmap; - boolean_t remove; - int pvhash_idx; - - pmap_intr_assert(); - assert(pn != vm_page_fictitious_addr); - if (pn == vm_page_guard_addr) - return; - - pai = ppn_to_pai(pn); - - if (!managed_page(pai)) { - /* - * Not a managed page. - */ - return; - } - - PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, - (int) pn, (int) prot, 0, 0, 0); - - /* - * Determine the new protection. - */ - switch (prot) { - case VM_PROT_READ: - case VM_PROT_READ|VM_PROT_EXECUTE: - remove = FALSE; - break; - case VM_PROT_ALL: - return; /* nothing to do */ - default: - remove = TRUE; - break; - } - - pv_h = pai_to_pvh(pai); - - LOCK_PVH(pai); - - - /* - * Walk down PV list, changing or removing all mappings. - */ - if (pv_h->pmap != PMAP_NULL) { - - pv_e = pv_h; - pvh_e = (pv_hashed_entry_t)pv_e; /* cheat */ - - do { - register vm_map_offset_t vaddr; - pmap = pv_e->pmap; - - vaddr = pv_e->va; - pte = pmap_pte(pmap, vaddr); - - if (0 == pte) { - panic("pmap_page_protect: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx, prot: %d kernel_pmap: %p", pmap, pn, vaddr, prot, kernel_pmap); - } - - nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink); /* if there is one */ - - /* - * Remove the mapping if new protection is NONE - * or if write-protecting a kernel mapping. - */ - if (remove || pmap == kernel_pmap) { - /* - * Remove the mapping, collecting any modify bits. 
- */ - pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID)); - - PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); - - pmap_phys_attributes[pai] |= *pte & (PHYS_MODIFIED|PHYS_REFERENCED); - - pmap_store_pte(pte, 0); - -#if TESTING - if (pmap->stats.resident_count < 1) - panic("pmap_page_protect: resident_count"); -#endif - assert(pmap->stats.resident_count >= 1); - OSAddAtomic(-1, &pmap->stats.resident_count); - - /* - * Deal with the pv_rooted_entry. - */ - - if (pv_e == pv_h) { - /* - * Fix up head later. - */ - pv_h->pmap = PMAP_NULL; - } - else { - /* - * Delete this entry. - */ - CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va); - LOCK_PV_HASH(pvhash_idx); - remque(&pvh_e->qlink); - pmap_pvh_unlink(pvh_e); - UNLOCK_PV_HASH(pvhash_idx); - - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; - - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pvh_cnt++; - } - } else { - /* - * Write-protect. - */ - pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WRITE)); - PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); - } - - pvh_e = nexth; - } while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h); - - - /* - * If pv_head mapping was removed, fix it up. 
- */ - - if (pv_h->pmap == PMAP_NULL) { - pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink); - - if (pvh_e != (pv_hashed_entry_t)pv_h) { - CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va); - LOCK_PV_HASH(pvhash_idx); - remque(&pvh_e->qlink); - pmap_pvh_unlink(pvh_e); - UNLOCK_PV_HASH(pvhash_idx); - pv_h->pmap = pvh_e->pmap; - pv_h->va = pvh_e->va; - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; - - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pvh_cnt++; - } - } - } - if (pvh_eh != PV_HASHED_ENTRY_NULL) { - PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt); - } - - UNLOCK_PVH(pai); - - PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END, - 0, 0, 0, 0, 0); - -} - - /* * Routine: * pmap_disconnect @@ -2459,427 +1714,6 @@ pmap_map_block( } -/* - * Insert the given physical page (p) at - * the specified virtual address (v) in the - * target physical map with the protection requested. - * - * If specified, the page will be wired down, meaning - * that the related pte cannot be reclaimed. - * - * NB: This is the only routine which MAY NOT lazy-evaluate - * or lose information. That is, this routine must actually - * insert this page into the given map NOW. 
- */ -void -pmap_enter( - register pmap_t pmap, - vm_map_offset_t vaddr, - ppnum_t pn, - vm_prot_t prot, - unsigned int flags, - boolean_t wired) -{ - register pt_entry_t *pte; - register pv_rooted_entry_t pv_h; - register int pai; - pv_hashed_entry_t pvh_e; - pv_hashed_entry_t pvh_new; - pv_hashed_entry_t *hashp; - pt_entry_t template; - pmap_paddr_t old_pa; - pmap_paddr_t pa = (pmap_paddr_t)i386_ptob(pn); - boolean_t need_tlbflush = FALSE; - boolean_t set_NX; - char oattr; - int pvhash_idx; - uint32_t pv_cnt; - boolean_t old_pa_locked; - - pmap_intr_assert(); - assert(pn != vm_page_fictitious_addr); - if (pmap_debug) - printf("pmap(%qx, %x)\n", vaddr, pn); - if (pmap == PMAP_NULL) - return; - if (pn == vm_page_guard_addr) - return; - - PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START, - (int) pmap, - (int) (vaddr>>32), (int) vaddr, - (int) pn, prot); - - if ( (prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled ) - set_NX = FALSE; - else - set_NX = TRUE; - - /* - * Must allocate a new pvlist entry while we're unlocked; - * zalloc may cause pageout (which will lock the pmap system). - * If we determine we need a pvlist entry, we will unlock - * and allocate one. Then we will retry, throughing away - * the allocated entry later (if we no longer need it). - */ - - pvh_new = PV_HASHED_ENTRY_NULL; -Retry: - pvh_e = PV_HASHED_ENTRY_NULL; - - PMAP_LOCK(pmap); - - /* - * Expand pmap to include this pte. Assume that - * pmap is always expanded to include enough hardware - * pages to map one VM page. - */ - - while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) { - /* - * Must unlock to expand the pmap. - */ - PMAP_UNLOCK(pmap); - pmap_expand(pmap, vaddr); /* going to grow pde level page(s) */ - PMAP_LOCK(pmap); - } - - old_pa = pte_to_pa(*pte); - pai = pa_index(old_pa); - old_pa_locked = FALSE; - - /* - * if we have a previous managed page, lock the pv entry now. 
after - * we lock it, check to see if someone beat us to the lock and if so - * drop the lock - */ - - if ((0 != old_pa) && managed_page(pai)) { - LOCK_PVH(pai); - old_pa_locked = TRUE; - old_pa = pte_to_pa(*pte); - if (0 == old_pa) { - UNLOCK_PVH(pai); /* some other path beat us to it */ - old_pa_locked = FALSE; - } - } - - - /* - * Special case if the incoming physical page is already mapped - * at this address. - */ - if (old_pa == pa) { - - /* - * May be changing its wired attribute or protection - */ - - template = pa_to_pte(pa) | INTEL_PTE_VALID; - - if(VM_MEM_NOT_CACHEABLE == (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) { - if(!(flags & VM_MEM_GUARDED)) - template |= INTEL_PTE_PTA; - template |= INTEL_PTE_NCACHE; - } - - if (pmap != kernel_pmap) - template |= INTEL_PTE_USER; - if (prot & VM_PROT_WRITE) - template |= INTEL_PTE_WRITE; - - if (set_NX == TRUE) - template |= INTEL_PTE_NX; - - if (wired) { - template |= INTEL_PTE_WIRED; - if (!iswired(*pte)) - OSAddAtomic(+1, &pmap->stats.wired_count); - } - else { - if (iswired(*pte)) { - assert(pmap->stats.wired_count >= 1); - OSAddAtomic(-1, &pmap->stats.wired_count); - } - } - - /* store modified PTE and preserve RC bits */ - pmap_update_pte(pte, *pte, template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD))); - if (old_pa_locked) { - UNLOCK_PVH(pai); - old_pa_locked = FALSE; - } - need_tlbflush = TRUE; - goto Done; - } - - /* - * Outline of code from here: - * 1) If va was mapped, update TLBs, remove the mapping - * and remove old pvlist entry. - * 2) Add pvlist entry for new mapping - * 3) Enter new mapping. - * - * If the old physical page is not managed step 1) is skipped - * (except for updating the TLBs), and the mapping is - * overwritten at step 3). If the new physical page is not - * managed, step 2) is skipped. - */ - - if (old_pa != (pmap_paddr_t) 0) { - - /* - * Don't do anything to pages outside valid memory here. 
- * Instead convince the code that enters a new mapping - * to overwrite the old one. - */ - - /* invalidate the PTE */ - pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID)); - /* propagate invalidate everywhere */ - PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); - /* remember reference and change */ - oattr = (char)(*pte & (PHYS_MODIFIED | PHYS_REFERENCED)); - /* completely invalidate the PTE */ - pmap_store_pte(pte, 0); - - if (managed_page(pai)) { -#if TESTING - if (pmap->stats.resident_count < 1) - panic("pmap_enter: resident_count"); -#endif - assert(pmap->stats.resident_count >= 1); - OSAddAtomic(-1, &pmap->stats.resident_count); - - if (iswired(*pte)) { - -#if TESTING - if (pmap->stats.wired_count < 1) - panic("pmap_enter: wired_count"); -#endif - assert(pmap->stats.wired_count >= 1); - OSAddAtomic(-1, &pmap->stats.wired_count); - } - - pmap_phys_attributes[pai] |= oattr; - /* - * Remove the mapping from the pvlist for - * this physical page. - * We'll end up with either a rooted pv or a - * hashed pv - */ - { - - pv_h = pai_to_pvh(pai); - - if (pv_h->pmap == PMAP_NULL) { - panic("pmap_enter: null pv_list!"); - } - - if (pv_h->va == vaddr && pv_h->pmap == pmap) { - /* - * Header is the pv_rooted_entry. 
- * If there is a next one, copy it to the - * header and free the next one (we cannot - * free the header) - */ - pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink); - if (pvh_e != (pv_hashed_entry_t)pv_h) { - pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va); - LOCK_PV_HASH(pvhash_idx); - remque(&pvh_e->qlink); - pmap_pvh_unlink(pvh_e); - UNLOCK_PV_HASH(pvhash_idx); - pv_h->pmap = pvh_e->pmap; - pv_h->va = pvh_e->va; - } - else { - pv_h->pmap = PMAP_NULL; - pvh_e = PV_HASHED_ENTRY_NULL; - } - } - else { - pv_hashed_entry_t *pprevh; - ppnum_t old_ppn; - /* wasn't the rooted pv - hash, find it, and unlink it */ - old_ppn = (ppnum_t)pa_index(old_pa); - CHK_NPVHASH(); - pvhash_idx = pvhashidx(pmap,vaddr); - LOCK_PV_HASH(pvhash_idx); - pprevh = pvhash(pvhash_idx); -#if PV_DEBUG - if (NULL==pprevh)panic("pmap enter 1"); -#endif - pvh_e = *pprevh; - pmap_pv_hashlist_walks++; - pv_cnt = 0; - while (PV_HASHED_ENTRY_NULL != pvh_e) { - pv_cnt++; - if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == old_ppn) break; - pprevh = &pvh_e->nexth; - pvh_e = pvh_e->nexth; - } - pmap_pv_hashlist_cnts += pv_cnt; - if (pmap_pv_hashlist_max < pv_cnt) pmap_pv_hashlist_max = pv_cnt; - if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pmap_enter: pv not in hash list"); - if(NULL==pprevh)panic("pmap enter 2"); - *pprevh = pvh_e->nexth; - remque(&pvh_e->qlink); - UNLOCK_PV_HASH(pvhash_idx); - } - } - } - else { - /* - * old_pa is not managed. - * Do removal part of accounting. - */ - - if (iswired(*pte)) { - assert(pmap->stats.wired_count >= 1); - OSAddAtomic(-1, &pmap->stats.wired_count); - } - } - } - - /* - * if we had a previously managed paged locked, unlock it now - */ - - if (old_pa_locked) { - UNLOCK_PVH(pai); - old_pa_locked = FALSE; - } - - pai = pa_index(pa); /* now working with new incoming phys page */ - if (managed_page(pai)) { - - /* - * Step 2) Enter the mapping in the PV list for this - * physical page. 
- */ - pv_h = pai_to_pvh(pai); - - LOCK_PVH(pai); - - if (pv_h->pmap == PMAP_NULL) { - /* - * No mappings yet, use rooted pv - */ - pv_h->va = vaddr; - pv_h->pmap = pmap; - queue_init(&pv_h->qlink); - } - else { - /* - * Add new pv_hashed_entry after header. - */ - if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) { - pvh_e = pvh_new; - pvh_new = PV_HASHED_ENTRY_NULL; /* show we used it */ - } else if (PV_HASHED_ENTRY_NULL == pvh_e) { - PV_HASHED_ALLOC(pvh_e); - if (PV_HASHED_ENTRY_NULL == pvh_e) { - /* the pv list is empty. - * if we are on the kernel pmap we'll use one of the special private - * kernel pv_e's, else, we need to unlock everything, zalloc a pv_e, - * and restart bringing in the pv_e with us. - */ - if (kernel_pmap == pmap) { - PV_HASHED_KERN_ALLOC(pvh_e); - } else { - UNLOCK_PVH(pai); - PMAP_UNLOCK(pmap); - pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); - goto Retry; - } - } - } - - if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pvh_e exhaustion"); - pvh_e->va = vaddr; - pvh_e->pmap = pmap; - pvh_e->ppn = pn; - CHK_NPVHASH(); - pvhash_idx = pvhashidx(pmap,vaddr); - LOCK_PV_HASH(pvhash_idx); - insque(&pvh_e->qlink, &pv_h->qlink); - hashp = pvhash(pvhash_idx); -#if PV_DEBUG - if(NULL==hashp)panic("pmap_enter 4"); -#endif - pvh_e->nexth = *hashp; - *hashp = pvh_e; - UNLOCK_PV_HASH(pvhash_idx); - - /* - * Remember that we used the pvlist entry. - */ - pvh_e = PV_HASHED_ENTRY_NULL; - } - - /* - * only count the mapping - * for 'managed memory' - */ - OSAddAtomic(+1, &pmap->stats.resident_count); - if (pmap->stats.resident_count > pmap->stats.resident_max) { - pmap->stats.resident_max = pmap->stats.resident_count; - } - } - - /* - * Step 3) Enter the mapping. - * - * Build a template to speed up entering - - * only the pfn changes. 
- */ - template = pa_to_pte(pa) | INTEL_PTE_VALID; - - if (flags & VM_MEM_NOT_CACHEABLE) { - if(!(flags & VM_MEM_GUARDED)) - template |= INTEL_PTE_PTA; - template |= INTEL_PTE_NCACHE; - } - - if (pmap != kernel_pmap) - template |= INTEL_PTE_USER; - if (prot & VM_PROT_WRITE) - template |= INTEL_PTE_WRITE; - - if (set_NX == TRUE) - template |= INTEL_PTE_NX; - - if (wired) { - template |= INTEL_PTE_WIRED; - OSAddAtomic(+1, &pmap->stats.wired_count); - } - pmap_store_pte(pte, template); - - /* if this was a managed page we delayed unlocking the pv until here - * to prevent pmap_page_protect et al from finding it until the pte - * has been stored */ - - if (managed_page(pai)) { - UNLOCK_PVH(pai); - } - -Done: - if (need_tlbflush == TRUE) - PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); - - if (pvh_e != PV_HASHED_ENTRY_NULL) { - PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1); - } - - if (pvh_new != PV_HASHED_ENTRY_NULL) { - PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1); - } - - PMAP_UNLOCK(pmap); - PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0); -} - /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address @@ -3917,95 +2751,6 @@ phys_page_exists( return TRUE; } -void -mapping_free_prime(void) -{ - int i; - pv_hashed_entry_t pvh_e; - pv_hashed_entry_t pvh_eh; - pv_hashed_entry_t pvh_et; - int pv_cnt; - - pv_cnt = 0; - pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; - for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) { - pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); - - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; - - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pv_cnt++; - } - PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt); - - pv_cnt = 0; - pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; - for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) { - pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); - - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; - - if (pvh_et == PV_HASHED_ENTRY_NULL) 
- pvh_et = pvh_e; - pv_cnt++; - } - PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt); - -} - -void -mapping_adjust(void) -{ - pv_hashed_entry_t pvh_e; - pv_hashed_entry_t pvh_eh; - pv_hashed_entry_t pvh_et; - int pv_cnt; - int i; - - if (mapping_adjust_call == NULL) { - thread_call_setup(&mapping_adjust_call_data, - (thread_call_func_t) mapping_adjust, - (thread_call_param_t) NULL); - mapping_adjust_call = &mapping_adjust_call_data; - } - - pv_cnt = 0; - pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; - if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) { - for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) { - pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); - - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; - - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pv_cnt++; - } - PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt); - } - - pv_cnt = 0; - pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; - if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) { - for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) { - pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); - - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; - - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pv_cnt++; - } - PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt); - } - mappingrecurse = 0; -} - void pmap_commpage32_init(vm_offset_t kernel_commpage, vm_offset_t user_commpage, int cnt) { diff --git a/osfmk/i386/pmap.h b/osfmk/i386/pmap.h index 9e6d65d20..0acf265d2 100644 --- a/osfmk/i386/pmap.h +++ b/osfmk/i386/pmap.h @@ -432,7 +432,8 @@ enum high_fixed_addresses { #define INTEL_PTE_NX (1ULL << 63) #define INTEL_PTE_INVALID 0 - +/* This is conservative, but suffices */ +#define INTEL_PTE_RSVD ((1ULL << 8) | (1ULL << 9) | (1ULL << 10) | (1ULL << 11) | (0x1FFULL << 54)) #define pa_to_pte(a) ((a) & INTEL_PTE_PFN) /* XXX */ #define pte_to_pa(p) ((p) & INTEL_PTE_PFN) /* XXX */ #define pte_increment_pa(p) ((p) += INTEL_OFFMASK+1) @@ -704,7 +705,7 @@ extern vm_offset_t 
pmap_high_shared_remap(enum high_fixed_addresses, vm_offset_t #endif extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_size_t *, int *, int *); - +extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__printflike(1,2)); /* diff --git a/osfmk/i386/pmap_internal.h b/osfmk/i386/pmap_internal.h index 1a1105399..04f4aa008 100644 --- a/osfmk/i386/pmap_internal.h +++ b/osfmk/i386/pmap_internal.h @@ -28,6 +28,7 @@ #include #include +#include #ifdef MACH_KERNEL_PRIVATE @@ -43,7 +44,6 @@ simple_unlock(&(pmap)->lock); \ } -extern void pmap_flush_tlbs(pmap_t pmap); #define PMAP_UPDATE_TLBS(pmap, s, e) \ pmap_flush_tlbs(pmap) @@ -67,10 +67,698 @@ void pmap_expand_pml4( void pmap_expand_pdpt( pmap_t map, vm_map_offset_t v); +extern void pmap_flush_tlbs(pmap_t pmap); + #if defined(__x86_64__) extern const boolean_t cpu_64bit; #else extern boolean_t cpu_64bit; #endif +/* + * Private data structures. + */ + +/* + * For each vm_page_t, there is a list of all currently + * valid virtual mappings of that page. An entry is + * a pv_rooted_entry_t; the list is the pv_table. + * + * N.B. with the new combo rooted/hashed scheme it is + * only possibly to remove individual non-rooted entries + * if they are found via the hashed chains as there is no + * way to unlink the singly linked hashed entries if navigated to + * via the queue list off the rooted entries. Think of it as + * hash/walk/pull, keeping track of the prev pointer while walking + * the singly linked hash list. All of this is to save memory and + * keep both types of pv_entries as small as possible. + */ + +/* + +PV HASHING Changes - JK 1/2007 + +Pve's establish physical to virtual mappings. These are used for aliasing of a +physical page to (potentially many) virtual addresses within pmaps. 
In the previous +implementation the structure of the pv_entries (each 16 bytes in size) was + +typedef struct pv_entry { + struct pv_entry_t next; + pmap_t pmap; + vm_map_offset_t va; +} *pv_entry_t; + +An initial array of these is created at boot time, one per physical page of memory, +indexed by the physical page number. Additionally, a pool of entries is created from a +pv_zone to be used as needed by pmap_enter() when it is creating new mappings. +Originally, we kept this pool around because the code in pmap_enter() was unable to +block if it needed an entry and none were available - we'd panic. Some time ago I +restructured the pmap_enter() code so that for user pmaps it can block while zalloc'ing +a pv structure and restart, removing a panic from the code (in the case of the kernel +pmap we cannot block and still panic, so, we keep a separate hot pool for use only on +kernel pmaps). The pool has not been removed since there is a large performance gain +keeping freed pv's around for reuse and not suffering the overhead of zalloc for every new pv we need. + +As pmap_enter() created new mappings it linked the new pve's for them off the fixed +pv array for that ppn (off the next pointer). These pve's are accessed for several +operations, one of them being address space teardown. In that case, we basically do this + + for (every page/pte in the space) { + calc pve_ptr from the ppn in the pte + for (every pv in the list for the ppn) { + if (this pv is for this pmap/vaddr) { + do housekeeping + unlink/free the pv + } + } + } + +The problem arose when we were running, say 8000 (or even 2000) apache or other processes +and one or all terminate. The list hanging off each pv array entry could have thousands of +entries. We were continuously linearly searching each of these lists as we stepped through +the address space we were tearing down. 
Because of the locks we hold, likely taking a cache +miss for each node, and interrupt disabling for MP issues the system became completely +unresponsive for many seconds while we did this. + +Realizing that pve's are accessed in two distinct ways (linearly running the list by ppn +for operations like pmap_page_protect and finding and modifying/removing a single pve as +part of pmap_enter processing) has led to modifying the pve structures and databases. + +There are now two types of pve structures. A "rooted" structure which is basically the +original structure accessed in an array by ppn, and a ''hashed'' structure accessed on a +hash list via a hash of [pmap, vaddr]. These have been designed with the two goals of +minimizing wired memory and making the lookup of a ppn faster. Since a vast majority of +pages in the system are not aliased and hence represented by a single pv entry I've kept +the rooted entry size as small as possible because there is one of these dedicated for +every physical page of memory. The hashed pve's are larger due to the addition of the hash +link and the ppn entry needed for matching while running the hash list to find the entry we +are looking for. This way, only systems that have lots of aliasing (like 2000+ httpd procs) +will pay the extra memory price. Both structures have the same first three fields allowing +some simplification in the code. + +They have these shapes + +typedef struct pv_rooted_entry { + queue_head_t qlink; + vm_map_offset_t va; + pmap_t pmap; +} *pv_rooted_entry_t; + + +typedef struct pv_hashed_entry { + queue_head_t qlink; + vm_map_offset_t va; + pmap_t pmap; + ppnum_t ppn; + struct pv_hashed_entry *nexth; +} *pv_hashed_entry_t; + +The main flow difference is that the code is now aware of the rooted entry and the hashed +entries. Code that runs the pv list still starts with the rooted entry and then continues +down the qlink onto the hashed entries. 
Code that is looking up a specific pv entry first +checks the rooted entry and then hashes and runs the hash list for the match. The hash list +lengths are much smaller than the original pv lists that contained all aliases for the specific ppn. + +*/ + +typedef struct pv_rooted_entry { /* first three entries must match pv_hashed_entry_t */ + queue_head_t qlink; + vm_map_offset_t va; /* virtual address for mapping */ + pmap_t pmap; /* pmap where mapping lies */ +} *pv_rooted_entry_t; + +#define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0) + + +typedef struct pv_hashed_entry { /* first three entries must match pv_rooted_entry_t */ + queue_head_t qlink; + vm_map_offset_t va; + pmap_t pmap; + ppnum_t ppn; + struct pv_hashed_entry *nexth; +} *pv_hashed_entry_t; + +#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0) + +/* #define PV_DEBUG 1 uncomment to enable some PV debugging code */ +#ifdef PV_DEBUG +#define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized"); +#else +#define CHK_NPVHASH() +#endif + +#define NPVHASH 4095 /* MUST BE 2^N - 1 */ +#define PV_HASHED_LOW_WATER_MARK 5000 +#define PV_HASHED_KERN_LOW_WATER_MARK 400 +#define PV_HASHED_ALLOC_CHUNK 2000 +#define PV_HASHED_KERN_ALLOC_CHUNK 200 + +#define PV_HASHED_ALLOC(pvh_e) { \ + simple_lock(&pv_hashed_free_list_lock); \ + if ((pvh_e = pv_hashed_free_list) != 0) { \ + pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \ + pv_hashed_free_count--; \ + if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \ + if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \ + thread_call_enter(mapping_adjust_call); \ + } \ + simple_unlock(&pv_hashed_free_list_lock); \ +} + +#define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \ + simple_lock(&pv_hashed_free_list_lock); \ + pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; \ + pv_hashed_free_list = pvh_eh; \ + pv_hashed_free_count += pv_cnt; \ + simple_unlock(&pv_hashed_free_list_lock); \ +} + +#define PV_HASHED_KERN_ALLOC(pvh_e) { \ + 
simple_lock(&pv_hashed_kern_free_list_lock); \ + if ((pvh_e = pv_hashed_kern_free_list) != 0) { \ + pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \ + pv_hashed_kern_free_count--; \ + if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) \ + if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \ + thread_call_enter(mapping_adjust_call); \ + } \ + simple_unlock(&pv_hashed_kern_free_list_lock); \ +} + +#define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \ + simple_lock(&pv_hashed_kern_free_list_lock); \ + pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; \ + pv_hashed_kern_free_list = pvh_eh; \ + pv_hashed_kern_free_count += pv_cnt; \ + simple_unlock(&pv_hashed_kern_free_list_lock); \ +} + +/* + * Index into pv_head table, its lock bits, and the modify/reference and managed bits + */ + +#define pa_index(pa) (i386_btop(pa)) +#define ppn_to_pai(ppn) ((int)ppn) + +#define pai_to_pvh(pai) (&pv_head_table[pai]) +#define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table) +#define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table) +#define pvhash(idx) (&pv_hash_table[idx]) + +#define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table) +#define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table) + +#define IS_MANAGED_PAGE(x) \ + ((unsigned int)(x) <= last_managed_page && \ + (pmap_phys_attributes[x] & PHYS_MANAGED)) + +/* + * Physical page attributes. Copy bits from PTE definition. + */ +#define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */ +#define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */ +#define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */ + +/* + * Amount of virtual memory mapped by one + * page-directory entry. 
+ */ +#define PDE_MAPPED_SIZE (pdetova(1)) + + +/* + * Locking and TLB invalidation + */ + +/* + * Locking Protocols: (changed 2/2007 JK) + * + * There are two structures in the pmap module that need locking: + * the pmaps themselves, and the per-page pv_lists (which are locked + * by locking the pv_lock_table entry that corresponds to the pv_head + * for the list in question.) Most routines want to lock a pmap and + * then do operations in it that require pv_list locking -- however + * pmap_remove_all and pmap_copy_on_write operate on a physical page + * basis and want to do the locking in the reverse order, i.e. lock + * a pv_list and then go through all the pmaps referenced by that list. + * + * The system wide pmap lock has been removed. Now, paths take a lock + * on the pmap before changing its 'shape' and the reverse order lockers + * (coming in by phys ppn) take a lock on the corresponding pv and then + * retest to be sure nothing changed during the window before they locked + * and can then run up/down the pv lists holding the list lock. This also + * lets the pmap layer run (nearly completely) interrupt enabled, unlike + * previously. 
+ */ + +/* + * PV locking + */ + +#define LOCK_PVH(index) { \ + mp_disable_preemption(); \ + lock_pvh_pai(index); \ +} + +#define UNLOCK_PVH(index) { \ + unlock_pvh_pai(index); \ + mp_enable_preemption(); \ +} +/* + * PV hash locking + */ + +#define LOCK_PV_HASH(hash) lock_hash_hash(hash) +#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash) +extern uint32_t npvhash; +extern pv_hashed_entry_t *pv_hash_table; /* hash lists */ +extern pv_hashed_entry_t pv_hashed_free_list; +extern pv_hashed_entry_t pv_hashed_kern_free_list; +decl_simple_lock_data(extern, pv_hashed_free_list_lock) +decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock) +decl_simple_lock_data(extern, pv_hash_table_lock) + +extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */ + +extern int pv_hashed_free_count; +extern int pv_hashed_kern_free_count; +#define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) +#define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) +extern char *pv_lock_table; /* pointer to array of bits */ + +extern char *pv_hash_lock_table; +extern pv_rooted_entry_t pv_head_table; /* array of entries, one + * per page */ +extern uint64_t pde_mapped_size; + +extern char *pmap_phys_attributes; +extern unsigned int last_managed_page; + +/* + * when spinning through pmap_remove + * ensure that we don't spend too much + * time with preemption disabled. 
+ * I'm setting the current threshold + * to 20us + */ +#define MAX_PREEMPTION_LATENCY_NS 20000 +extern uint64_t max_preemption_latency_tsc; + +/* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */ +#ifdef DEBUGINTERRUPTS +#define pmap_intr_assert() { \ + if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \ + panic("pmap interrupt assert %s, %d",__FILE__, __LINE__); \ +} +#else +#define pmap_intr_assert() +#endif + +extern int nx_enabled; +extern unsigned int inuse_ptepages_count; + +static inline uint32_t +pvhashidx(pmap_t pmap, vm_map_offset_t va) +{ + return ((uint32_t)(uintptr_t)pmap ^ + ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) & + npvhash; +} + +/* + * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain. + * properly deals with the anchor. + * must be called with the hash locked, does not unlock it + */ + +static inline void +pmap_pvh_unlink(pv_hashed_entry_t pvh) +{ + pv_hashed_entry_t curh; + pv_hashed_entry_t *pprevh; + int pvhash_idx; + + CHK_NPVHASH(); + pvhash_idx = pvhashidx(pvh->pmap, pvh->va); + + pprevh = pvhash(pvhash_idx); + +#if PV_DEBUG + if (NULL == *pprevh) + panic("pvh_unlink null anchor"); /* JK DEBUG */ +#endif + curh = *pprevh; + + while (PV_HASHED_ENTRY_NULL != curh) { + if (pvh == curh) + break; + pprevh = &curh->nexth; + curh = curh->nexth; + } + if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh"); + *pprevh = pvh->nexth; + return; +} + +static inline void +pv_hash_add(pv_hashed_entry_t pvh_e, + pv_rooted_entry_t pv_h) +{ + pv_hashed_entry_t *hashp; + int pvhash_idx; + + CHK_NPVHASH(); + pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va); + LOCK_PV_HASH(pvhash_idx); + insque(&pvh_e->qlink, &pv_h->qlink); + hashp = pvhash(pvhash_idx); +#if PV_DEBUG + if (NULL==hashp) + panic("pv_hash_add(%p) null hash bucket", pvh_e); +#endif + pvh_e->nexth = *hashp; + *hashp = pvh_e; + UNLOCK_PV_HASH(pvhash_idx); +} + +static inline void 
+pv_hash_remove(pv_hashed_entry_t pvh_e) +{ + int pvhash_idx; + + CHK_NPVHASH(); + pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va); + LOCK_PV_HASH(pvhash_idx); + remque(&pvh_e->qlink); + pmap_pvh_unlink(pvh_e); + UNLOCK_PV_HASH(pvhash_idx); +} + +static inline boolean_t popcnt1(uint64_t distance) { + return ((distance & (distance - 1)) == 0); +} + +/* + * Routines to handle suppression of/recovery from some forms of pagetable corruption + * incidents observed in the field. These can be either software induced (wild + * stores to the mapwindows where applicable, use after free errors + * (typically of pages addressed physically), mis-directed DMAs etc., or due + * to DRAM/memory hierarchy/interconnect errors. Given the theoretical rarity of these errors, + * the recording mechanism is deliberately not MP-safe. The overarching goal is to + * still assert on potential software races, but attempt recovery from incidents + * identifiable as occurring due to issues beyond the control of the pmap module. + * The latter includes single-bit errors and malformed pagetable entries. + * We currently limit ourselves to recovery/suppression of one incident per + * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident + * are logged. + * Assertions are not suppressed if kernel debugging is enabled. 
(DRK 09) + */ + +typedef enum { + PTE_VALID = 0x0, + PTE_INVALID = 0x1, + PTE_RSVD = 0x2, + PTE_SUPERVISOR = 0x4, + PTE_BITFLIP = 0x8, + PV_BITFLIP = 0x10, + PTE_INVALID_CACHEABILITY = 0x20 +} pmap_pagetable_corruption_t; + +typedef enum { + ROOT_PRESENT = 0, + ROOT_ABSENT = 1 +} pmap_pv_assertion_t; + +typedef enum { + PMAP_ACTION_IGNORE = 0x0, + PMAP_ACTION_ASSERT = 0x1, + PMAP_ACTION_RETRY = 0x2, + PMAP_ACTION_RETRY_RELOCK = 0x4 +} pmap_pagetable_corruption_action_t; + +#define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL) +extern uint64_t pmap_pagetable_corruption_interval_abstime; + +extern uint32_t pmap_pagetable_corruption_incidents; +#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8) +typedef struct { + pmap_pv_assertion_t incident; + pmap_pagetable_corruption_t reason; + pmap_pagetable_corruption_action_t action; + pmap_t pmap; + vm_map_offset_t vaddr; + pt_entry_t pte; + ppnum_t ppn; + pmap_t pvpmap; + vm_map_offset_t pvva; + uint64_t abstime; +} pmap_pagetable_corruption_record_t; + +extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[]; +extern uint64_t pmap_pagetable_corruption_last_abstime; +extern thread_call_t pmap_pagetable_corruption_log_call; +extern boolean_t pmap_pagetable_corruption_timeout; + +static inline void +pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) { + uint32_t pmap_pagetable_corruption_log_index; + pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG; + pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident; + pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason; + pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action; + 
pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap; + pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr; + pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep; + pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn; + pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap; + pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva; + pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time(); + /* Asynchronously log */ + thread_call_enter(pmap_pagetable_corruption_log_call); +} + +/* + * Heuristically classify a PV list / PTE inconsistency found for + * (pmap, vaddr) on the page *ppnp and return the action the caller + * should take. On PMAP_ACTION_RETRY_RELOCK the PV lock of the original + * page is dropped and *ppnp is updated to the corrected page number. + * The incident is recorded via pmap_pagetable_corruption_log(). + */ +static inline pmap_pagetable_corruption_action_t +pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) { + /* holds PMAP_ACTION_* values and is returned, so it uses the action enum */ + pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT; + pmap_pagetable_corruption_t suppress_reason = PTE_VALID; + ppnum_t suppress_ppn = 0; + pt_entry_t cpte = *ptep; + ppnum_t cpn = pa_index(pte_to_pa(cpte)); + ppnum_t ppn = *ppnp; + pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn)); + pv_rooted_entry_t pv_e = pv_h; + uint32_t bitdex; + pmap_t pvpmap = pv_h->pmap; + vm_map_offset_t pvva = pv_h->va; + boolean_t ppcd = FALSE; + + /* Ideally, we'd consult the Mach VM here to definitively determine + * the nature of the mapping for this address space and address. + * As that would be a layering violation in this context, we + * use various heuristics to recover from single bit errors, + * malformed pagetable entries etc. These are not intended + * to be comprehensive.
+ */ + + /* As a precautionary measure, mark A+D */ + pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED); + + /* + * Correct potential single bit errors in either (but not both) element + * of the PV + */ + do { + if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) || + (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) { + pv_e->pmap = pmap; + pv_e->va = vaddr; + suppress_reason = PV_BITFLIP; + action = PMAP_ACTION_RETRY; + goto pmap_cpc_exit; + } + } while((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink)) != pv_h); + + /* Discover root entries with a Hamming + * distance of 1 from the supplied + * physical page frame. + */ + for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) { + ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex); + if (IS_MANAGED_PAGE(npn)) { + pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn)); + if (npv_h->va == vaddr && npv_h->pmap == pmap) { + suppress_reason = PTE_BITFLIP; + suppress_ppn = npn; + action = PMAP_ACTION_RETRY_RELOCK; + UNLOCK_PVH(ppn_to_pai(ppn)); + *ppnp = npn; + goto pmap_cpc_exit; + } + } + } + + if (pmap == kernel_pmap) { + action = PMAP_ACTION_ASSERT; + goto pmap_cpc_exit; + } + + /* Check for malformed/inconsistent entries */ + + if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) { + action = PMAP_ACTION_IGNORE; + suppress_reason = PTE_INVALID_CACHEABILITY; + } + else if (cpte & INTEL_PTE_RSVD) { + action = PMAP_ACTION_IGNORE; + suppress_reason = PTE_RSVD; + } + else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) { + action = PMAP_ACTION_IGNORE; + suppress_reason = PTE_SUPERVISOR; + } +pmap_cpc_exit: + PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd)); + + if (debug_boot_arg && !ppcd) { + action = PMAP_ACTION_ASSERT; + } + + if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) { + action = 
PMAP_ACTION_ASSERT; + pmap_pagetable_corruption_timeout = TRUE; + } + else + { + pmap_pagetable_corruption_last_abstime = mach_absolute_time(); + } + pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva); + return action; +} +/* + * Remove pv list entry. + * Called with pv_head_table entry locked. + * Returns pv entry to be freed (or NULL). + */ + +static inline __attribute__((always_inline)) pv_hashed_entry_t +pmap_pv_remove( pmap_t pmap, + vm_map_offset_t vaddr, + ppnum_t *ppnp, + pt_entry_t *pte) +{ + pv_hashed_entry_t pvh_e; + pv_rooted_entry_t pv_h; + pv_hashed_entry_t *pprevh; + int pvhash_idx; + uint32_t pv_cnt; + ppnum_t ppn; + +pmap_pv_remove_retry: + ppn = *ppnp; + pvh_e = PV_HASHED_ENTRY_NULL; + pv_h = pai_to_pvh(ppn_to_pai(ppn)); + + if (pv_h->pmap == PMAP_NULL) { + pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT); + if (pac == PMAP_ACTION_IGNORE) + goto pmap_pv_remove_exit; + else if (pac == PMAP_ACTION_ASSERT) + panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): null pv_list!", pmap, vaddr, ppn, *pte); + else if (pac == PMAP_ACTION_RETRY_RELOCK) { + LOCK_PVH(ppn_to_pai(*ppnp)); + pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED); + goto pmap_pv_remove_retry; + } + else if (pac == PMAP_ACTION_RETRY) + goto pmap_pv_remove_retry; + } + + if (pv_h->va == vaddr && pv_h->pmap == pmap) { + /* + * Header is the pv_rooted_entry. + * We can't free that. If there is a queued + * entry after this one we remove that + * from the ppn queue, we remove it from the hash chain + * and copy it to the rooted entry. Then free it instead. + */ + pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink); + if (pv_h != (pv_rooted_entry_t) pvh_e) { + /* + * Entry queued to root, remove this from hash + * and install as new root. 
+ */ + CHK_NPVHASH(); + pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va); + LOCK_PV_HASH(pvhash_idx); + remque(&pvh_e->qlink); + pprevh = pvhash(pvhash_idx); + if (PV_HASHED_ENTRY_NULL == *pprevh) { + panic("pmap_pv_remove(%p,0x%llx,0x%x): " + "empty hash, removing rooted", + pmap, vaddr, ppn); + } + pmap_pvh_unlink(pvh_e); + UNLOCK_PV_HASH(pvhash_idx); + pv_h->pmap = pvh_e->pmap; + pv_h->va = pvh_e->va; /* dispose of pvh_e */ + } else { + /* none queued after rooted */ + pv_h->pmap = PMAP_NULL; + pvh_e = PV_HASHED_ENTRY_NULL; + } + } else { + /* + * not removing rooted pv. find it on hash chain, remove from + * ppn queue and hash chain and free it + */ + CHK_NPVHASH(); + pvhash_idx = pvhashidx(pmap, vaddr); + LOCK_PV_HASH(pvhash_idx); + pprevh = pvhash(pvhash_idx); + if (PV_HASHED_ENTRY_NULL == *pprevh) { + panic("pmap_pv_remove(%p,0x%llx,0x%x): empty hash", pmap, vaddr, ppn); + } + pvh_e = *pprevh; + pmap_pv_hashlist_walks++; + pv_cnt = 0; + while (PV_HASHED_ENTRY_NULL != pvh_e) { + pv_cnt++; + if (pvh_e->pmap == pmap && + pvh_e->va == vaddr && + pvh_e->ppn == ppn) + break; + pprevh = &pvh_e->nexth; + pvh_e = pvh_e->nexth; + } + if (PV_HASHED_ENTRY_NULL == pvh_e) { + pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT); + + if (pac == PMAP_ACTION_ASSERT) + panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, pv_h->pmap, pv_h->va); + else { + UNLOCK_PV_HASH(pvhash_idx); + if (pac == PMAP_ACTION_RETRY_RELOCK) { + LOCK_PVH(ppn_to_pai(*ppnp)); + pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED); + goto pmap_pv_remove_retry; + } + else if (pac == PMAP_ACTION_RETRY) { + goto pmap_pv_remove_retry; + } + else if (pac == PMAP_ACTION_IGNORE) { + goto pmap_pv_remove_exit; + } + } + } + pmap_pv_hashlist_cnts += pv_cnt; + if (pmap_pv_hashlist_max < pv_cnt) + pmap_pv_hashlist_max = pv_cnt; + *pprevh = pvh_e->nexth; + remque(&pvh_e->qlink); 
+ UNLOCK_PV_HASH(pvhash_idx); + } +pmap_pv_remove_exit: + return pvh_e; +} + #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/i386/pmap_x86_common.c b/osfmk/i386/pmap_x86_common.c index de9b75835..53c1996e1 100644 --- a/osfmk/i386/pmap_x86_common.c +++ b/osfmk/i386/pmap_x86_common.c @@ -28,6 +28,28 @@ #include #include #include + + +void pmap_remove_range( + pmap_t pmap, + vm_map_offset_t va, + pt_entry_t *spte, + pt_entry_t *epte); + +pv_rooted_entry_t pv_head_table; /* array of entries, one per + * page */ +thread_call_t mapping_adjust_call; +static thread_call_data_t mapping_adjust_call_data; +uint32_t mappingrecurse = 0; + +pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[PMAP_PAGETABLE_CORRUPTION_MAX_LOG]; +uint32_t pmap_pagetable_corruption_incidents; +uint64_t pmap_pagetable_corruption_last_abstime = (~(0ULL) >> 1); +uint64_t pmap_pagetable_corruption_interval_abstime; +thread_call_t pmap_pagetable_corruption_log_call; +static thread_call_data_t pmap_pagetable_corruption_log_call_data; +boolean_t pmap_pagetable_corruption_timeout = FALSE; + /* * The Intel platform can nest at the PDE level, so NBPDE (i.e. 2MB) at a time, * on a NBPDE boundary. @@ -315,3 +337,942 @@ pmap_find_phys(pmap_t pmap, addr64_t va) return ppn; } +/* + * Insert the given physical page (p) at + * the specified virtual address (v) in the + * target physical map with the protection requested. + * + * If specified, the page will be wired down, meaning + * that the related pte cannot be reclaimed. + * + * NB: This is the only routine which MAY NOT lazy-evaluate + * or lose information. That is, this routine must actually + * insert this page into the given map NOW. 
+ */ +void +pmap_enter( + register pmap_t pmap, + vm_map_offset_t vaddr, + ppnum_t pn, + vm_prot_t prot, + unsigned int flags, + boolean_t wired) +{ + pt_entry_t *pte; + pv_rooted_entry_t pv_h; + int pai; + pv_hashed_entry_t pvh_e; + pv_hashed_entry_t pvh_new; + pt_entry_t template; + pmap_paddr_t old_pa; + pmap_paddr_t pa = (pmap_paddr_t) i386_ptob(pn); + boolean_t need_tlbflush = FALSE; + boolean_t set_NX; + char oattr; + boolean_t old_pa_locked; + /* 2MiB mappings are confined to x86_64 by VM */ + boolean_t superpage = flags & VM_MEM_SUPERPAGE; + vm_object_t delpage_pm_obj = NULL; + int delpage_pde_index = 0; + pt_entry_t old_pte; + + pmap_intr_assert(); + assert(pn != vm_page_fictitious_addr); + + if (pmap == PMAP_NULL) + return; + if (pn == vm_page_guard_addr) + return; + + PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START, + pmap, + (uint32_t) (vaddr >> 32), (uint32_t) vaddr, + pn, prot); + + if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled) + set_NX = FALSE; + else + set_NX = TRUE; + + /* + * Must allocate a new pvlist entry while we're unlocked; + * zalloc may cause pageout (which will lock the pmap system). + * If we determine we need a pvlist entry, we will unlock + * and allocate one. Then we will retry, throwing away + * the allocated entry later (if we no longer need it). + */ + + pvh_new = PV_HASHED_ENTRY_NULL; +Retry: + pvh_e = PV_HASHED_ENTRY_NULL; + + PMAP_LOCK(pmap); + + /* + * Expand pmap to include this pte. Assume that + * pmap is always expanded to include enough hardware + * pages to map one VM page.
+ */ + if(superpage) { + while ((pte = pmap64_pde(pmap, vaddr)) == PD_ENTRY_NULL) { + /* need room for another pde entry */ + PMAP_UNLOCK(pmap); + pmap_expand_pdpt(pmap, vaddr); + PMAP_LOCK(pmap); + } + } else { + while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) { + /* + * Must unlock to expand the pmap + * going to grow pde level page(s) + */ + PMAP_UNLOCK(pmap); + pmap_expand(pmap, vaddr); + PMAP_LOCK(pmap); + } + } + + if (superpage && *pte && !(*pte & INTEL_PTE_PS)) { + /* + * There is still an empty page table mapped that + * was used for a previous base page mapping. + * Remember the PDE and the PDE index, so that we + * can free the page at the end of this function. + */ + delpage_pde_index = (int)pdeidx(pmap, vaddr); + delpage_pm_obj = pmap->pm_obj; + *pte = 0; + } + + + old_pa = pte_to_pa(*pte); + pai = pa_index(old_pa); + old_pa_locked = FALSE; + + /* + * if we have a previous managed page, lock the pv entry now. after + * we lock it, check to see if someone beat us to the lock and if so + * drop the lock + */ + if ((0 != old_pa) && IS_MANAGED_PAGE(pai)) { + LOCK_PVH(pai); + old_pa_locked = TRUE; + old_pa = pte_to_pa(*pte); + if (0 == old_pa) { + UNLOCK_PVH(pai); /* another path beat us to it */ + old_pa_locked = FALSE; + } + } + + /* + * Special case if the incoming physical page is already mapped + * at this address. 
+ */ + if (old_pa == pa) { + + /* + * May be changing its wired attribute or protection + */ + + template = pa_to_pte(pa) | INTEL_PTE_VALID; + + if (VM_MEM_NOT_CACHEABLE == + (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) { + if (!(flags & VM_MEM_GUARDED)) + template |= INTEL_PTE_PTA; + template |= INTEL_PTE_NCACHE; + } + if (pmap != kernel_pmap) + template |= INTEL_PTE_USER; + if (prot & VM_PROT_WRITE) + template |= INTEL_PTE_WRITE; + + if (set_NX) + template |= INTEL_PTE_NX; + + if (wired) { + template |= INTEL_PTE_WIRED; + if (!iswired(*pte)) + OSAddAtomic(+1, + &pmap->stats.wired_count); + } else { + if (iswired(*pte)) { + assert(pmap->stats.wired_count >= 1); + OSAddAtomic(-1, + &pmap->stats.wired_count); + } + } + if (superpage) /* this path can not be used */ + template |= INTEL_PTE_PS; /* to change the page size! */ + + /* store modified PTE and preserve RC bits */ + pmap_update_pte(pte, *pte, + template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD))); + if (old_pa_locked) { + UNLOCK_PVH(pai); + old_pa_locked = FALSE; + } + need_tlbflush = TRUE; + goto Done; + } + + /* + * Outline of code from here: + * 1) If va was mapped, update TLBs, remove the mapping + * and remove old pvlist entry. + * 2) Add pvlist entry for new mapping + * 3) Enter new mapping. + * + * If the old physical page is not managed step 1) is skipped + * (except for updating the TLBs), and the mapping is + * overwritten at step 3). If the new physical page is not + * managed, step 2) is skipped. + */ + + if (old_pa != (pmap_paddr_t) 0) { + + /* + * Don't do anything to pages outside valid memory here. + * Instead convince the code that enters a new mapping + * to overwrite the old one. 
+ */ + + /* invalidate the PTE */ + pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID)); + /* propagate invalidate everywhere */ + PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); + /* remember reference and change */ + old_pte = *pte; + oattr = (char) (old_pte & (PHYS_MODIFIED | PHYS_REFERENCED)); + /* completely invalidate the PTE */ + pmap_store_pte(pte, 0); + + if (IS_MANAGED_PAGE(pai)) { +#if TESTING + if (pmap->stats.resident_count < 1) + panic("pmap_enter: resident_count"); +#endif + assert(pmap->stats.resident_count >= 1); + OSAddAtomic(-1, + &pmap->stats.resident_count); + + /* + * Test the saved copy: *pte was zeroed just above, + * so its wired bit can no longer be read from the + * page table. + */ + if (iswired(old_pte)) { +#if TESTING + if (pmap->stats.wired_count < 1) + panic("pmap_enter: wired_count"); +#endif + assert(pmap->stats.wired_count >= 1); + OSAddAtomic(-1, + &pmap->stats.wired_count); + } + pmap_phys_attributes[pai] |= oattr; + + /* + * Remove the mapping from the pvlist for + * this physical page. + * We'll end up with either a rooted pv or a + * hashed pv + */ + pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, &old_pte); + + } else { + + /* + * old_pa is not managed. + * Do removal part of accounting. + * (Use the saved PTE; the live one is already zero.) + */ + + if (iswired(old_pte)) { + assert(pmap->stats.wired_count >= 1); + OSAddAtomic(-1, + &pmap->stats.wired_count); + } + } + } + + /* + * if we had a previously managed page locked, unlock it now + */ + if (old_pa_locked) { + UNLOCK_PVH(pai); + old_pa_locked = FALSE; + } + + pai = pa_index(pa); /* now working with new incoming phys page */ + if (IS_MANAGED_PAGE(pai)) { + + /* + * Step 2) Enter the mapping in the PV list for this + * physical page. + */ + pv_h = pai_to_pvh(pai); + + LOCK_PVH(pai); + + if (pv_h->pmap == PMAP_NULL) { + /* + * No mappings yet, use rooted pv + */ + pv_h->va = vaddr; + pv_h->pmap = pmap; + queue_init(&pv_h->qlink); + } else { + /* + * Add new pv_hashed_entry after header.
+ */ + if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) { + pvh_e = pvh_new; + pvh_new = PV_HASHED_ENTRY_NULL; + } else if (PV_HASHED_ENTRY_NULL == pvh_e) { + PV_HASHED_ALLOC(pvh_e); + if (PV_HASHED_ENTRY_NULL == pvh_e) { + /* + * the pv list is empty. if we are on + * the kernel pmap we'll use one of + * the special private kernel pv_e's, + * else, we need to unlock + * everything, zalloc a pv_e, and + * restart bringing in the pv_e with + * us. + */ + if (kernel_pmap == pmap) { + PV_HASHED_KERN_ALLOC(pvh_e); + } else { + UNLOCK_PVH(pai); + PMAP_UNLOCK(pmap); + pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); + goto Retry; + } + } + } + + if (PV_HASHED_ENTRY_NULL == pvh_e) + panic("Mapping alias chain exhaustion, possibly induced by numerous kernel virtual double mappings"); + + pvh_e->va = vaddr; + pvh_e->pmap = pmap; + pvh_e->ppn = pn; + pv_hash_add(pvh_e, pv_h); + + /* + * Remember that we used the pvlist entry. + */ + pvh_e = PV_HASHED_ENTRY_NULL; + } + + /* + * only count the mapping + * for 'managed memory' + */ + OSAddAtomic(+1, & pmap->stats.resident_count); + if (pmap->stats.resident_count > pmap->stats.resident_max) { + pmap->stats.resident_max = pmap->stats.resident_count; + } + } + /* + * Step 3) Enter the mapping. + * + * Build a template to speed up entering - + * only the pfn changes. 
+ */ + template = pa_to_pte(pa) | INTEL_PTE_VALID; + + if (flags & VM_MEM_NOT_CACHEABLE) { + if (!(flags & VM_MEM_GUARDED)) + template |= INTEL_PTE_PTA; + template |= INTEL_PTE_NCACHE; + } + if (pmap != kernel_pmap) + template |= INTEL_PTE_USER; + if (prot & VM_PROT_WRITE) + template |= INTEL_PTE_WRITE; + if (set_NX) + template |= INTEL_PTE_NX; + if (wired) { + template |= INTEL_PTE_WIRED; + OSAddAtomic(+1, & pmap->stats.wired_count); + } + if (superpage) + template |= INTEL_PTE_PS; + pmap_store_pte(pte, template); + + /* + * if this was a managed page we delayed unlocking the pv until here + * to prevent pmap_page_protect et al from finding it until the pte + * has been stored + */ + if (IS_MANAGED_PAGE(pai)) { + UNLOCK_PVH(pai); + } +Done: + if (need_tlbflush == TRUE) + PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); + + if (pvh_e != PV_HASHED_ENTRY_NULL) { + PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1); + } + if (pvh_new != PV_HASHED_ENTRY_NULL) { + PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1); + } + PMAP_UNLOCK(pmap); + + if (delpage_pm_obj) { + vm_page_t m; + + vm_object_lock(delpage_pm_obj); + m = vm_page_lookup(delpage_pm_obj, delpage_pde_index); + if (m == VM_PAGE_NULL) + panic("pmap_enter: pte page not in object"); + VM_PAGE_FREE(m); + OSAddAtomic(-1, &inuse_ptepages_count); + vm_object_unlock(delpage_pm_obj); + } + + PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0); +} + +/* + * Remove a range of hardware page-table entries. + * The entries given are the first (inclusive) + * and last (exclusive) entries for the VM pages. + * The virtual address is the va for the first pte. + * + * The pmap must be locked. + * If the pmap is not the kernel pmap, the range must lie + * entirely within one pte-page. This is NOT checked. + * Assumes that the pte-page exists. 
+ */ + +void +pmap_remove_range( + pmap_t pmap, + vm_map_offset_t start_vaddr, + pt_entry_t *spte, + pt_entry_t *epte) +{ + pt_entry_t *cpte; + pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL; + pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL; + pv_hashed_entry_t pvh_e; + int pvh_cnt = 0; + int num_removed, num_unwired, num_found, num_invalid; + int pai; + pmap_paddr_t pa; + vm_map_offset_t vaddr; + + num_removed = 0; + num_unwired = 0; + num_found = 0; + num_invalid = 0; +#if defined(__i386__) + if (pmap != kernel_pmap && + pmap->pm_task_map == TASK_MAP_32BIT && + start_vaddr >= HIGH_MEM_BASE) { + /* + * The range is in the "high_shared_pde" which is shared + * between the kernel and all 32-bit tasks. It holds + * the 32-bit commpage but also the trampolines, GDT, etc... + * so we can't let user tasks remove anything from it. + */ + return; + } +#endif + /* invalidate the PTEs first to "freeze" them */ + for (cpte = spte, vaddr = start_vaddr; + cpte < epte; + cpte++, vaddr += PAGE_SIZE_64) { + pt_entry_t p = *cpte; + + pa = pte_to_pa(p); + if (pa == 0) + continue; + num_found++; + + if (iswired(p)) + num_unwired++; + + pai = pa_index(pa); + + if (!IS_MANAGED_PAGE(pai)) { + /* + * Outside range of managed physical memory. + * Just remove the mappings. 
+ */ + pmap_store_pte(cpte, 0); + continue; + } + + if ((p & INTEL_PTE_VALID) == 0) + num_invalid++; + + /* invalidate the PTE */ + pmap_update_pte(cpte, *cpte, (*cpte & ~INTEL_PTE_VALID)); + } + + if (num_found == 0) { + /* nothing was changed: we're done */ + goto update_counts; + } + + /* propagate the invalidates to other CPUs */ + + PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr); + + for (cpte = spte, vaddr = start_vaddr; + cpte < epte; + cpte++, vaddr += PAGE_SIZE_64) { + + pa = pte_to_pa(*cpte); + if (pa == 0) + continue; + + pai = pa_index(pa); + + LOCK_PVH(pai); + + pa = pte_to_pa(*cpte); + if (pa == 0) { + UNLOCK_PVH(pai); + continue; + } + num_removed++; + + /* + * Get the modify and reference bits, then + * nuke the entry in the page table + */ + /* remember reference and change */ + pmap_phys_attributes[pai] |= + (char) (*cpte & (PHYS_MODIFIED | PHYS_REFERENCED)); + + /* + * Remove the mapping from the pvlist for this physical page. + */ + pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, cpte); + + /* completely invalidate the PTE */ + pmap_store_pte(cpte, 0); + + UNLOCK_PVH(pai); + + if (pvh_e != PV_HASHED_ENTRY_NULL) { + pvh_e->qlink.next = (queue_entry_t) pvh_eh; + pvh_eh = pvh_e; + + if (pvh_et == PV_HASHED_ENTRY_NULL) { + pvh_et = pvh_e; + } + pvh_cnt++; + } + } /* for loop */ + + if (pvh_eh != PV_HASHED_ENTRY_NULL) { + PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt); + } +update_counts: + /* + * Update the counts + */ +#if TESTING + if (pmap->stats.resident_count < num_removed) + panic("pmap_remove_range: resident_count"); +#endif + assert(pmap->stats.resident_count >= num_removed); + OSAddAtomic(-num_removed, &pmap->stats.resident_count); + +#if TESTING + if (pmap->stats.wired_count < num_unwired) + panic("pmap_remove_range: wired_count"); +#endif + assert(pmap->stats.wired_count >= num_unwired); + OSAddAtomic(-num_unwired, &pmap->stats.wired_count); + + return; +} + + +/* + * Remove the given range of addresses + * from the specified map. 
+ * + * It is assumed that the start and end are properly + * rounded to the hardware page size. + */ +void +pmap_remove( + pmap_t map, + addr64_t s64, + addr64_t e64) +{ + pt_entry_t *pde; + pt_entry_t *spte, *epte; + addr64_t l64; + uint64_t deadline; + + pmap_intr_assert(); + + if (map == PMAP_NULL || s64 == e64) + return; + + PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START, + map, + (uint32_t) (s64 >> 32), s64, + (uint32_t) (e64 >> 32), e64); + + + PMAP_LOCK(map); + +#if 0 + /* + * Check that address range in the kernel does not overlap the stacks. + * We initialize local static min/max variables once to avoid making + * 2 function calls for every remove. Note also that these functions + * both return 0 before kernel stacks have been initialized, and hence + * the panic is not triggered in this case. + */ + if (map == kernel_pmap) { + static vm_offset_t kernel_stack_min = 0; + static vm_offset_t kernel_stack_max = 0; + + if (kernel_stack_min == 0) { + kernel_stack_min = min_valid_stack_address(); + kernel_stack_max = max_valid_stack_address(); + } + if ((kernel_stack_min <= s64 && s64 < kernel_stack_max) || + (kernel_stack_min < e64 && e64 <= kernel_stack_max)) + panic("pmap_remove() attempted in kernel stack"); + } +#else + + /* + * The values of kernel_stack_min and kernel_stack_max are no longer + * relevant now that we allocate kernel stacks in the kernel map, + * so the old code above no longer applies. If we wanted to check that + * we weren't removing a mapping of a page in a kernel stack we'd + * mark the PTE with an unused bit and check that here. 
+ */ + +#endif + + deadline = rdtsc64() + max_preemption_latency_tsc; + + while (s64 < e64) { + l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size - 1); + if (l64 > e64) + l64 = e64; + pde = pmap_pde(map, s64); + + if (pde && (*pde & INTEL_PTE_VALID)) { + if (*pde & INTEL_PTE_PS) { + /* + * If we're removing a superpage, pmap_remove_range() + * must work on level 2 instead of level 1; and we're + * only passing a single level 2 entry instead of a + * level 1 range. + */ + spte = pde; + epte = spte+1; /* excluded */ + } else { + spte = pmap_pte(map, (s64 & ~(pde_mapped_size - 1))); + spte = &spte[ptenum(s64)]; + epte = &spte[intel_btop(l64 - s64)]; + } + pmap_remove_range(map, s64, spte, epte); + } + s64 = l64; + + if (s64 < e64 && rdtsc64() >= deadline) { + PMAP_UNLOCK(map) + PMAP_LOCK(map) + deadline = rdtsc64() + max_preemption_latency_tsc; + } + } + + PMAP_UNLOCK(map); + + PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END, + map, 0, 0, 0, 0); + +} + +/* + * Routine: pmap_page_protect + * + * Function: + * Lower the permission for all mappings to a given + * page. + */ +void +pmap_page_protect( + ppnum_t pn, + vm_prot_t prot) +{ + pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL; + pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL; + pv_hashed_entry_t nexth; + int pvh_cnt = 0; + pv_rooted_entry_t pv_h; + pv_rooted_entry_t pv_e; + pv_hashed_entry_t pvh_e; + pt_entry_t *pte; + int pai; + pmap_t pmap; + boolean_t remove; + + pmap_intr_assert(); + assert(pn != vm_page_fictitious_addr); + if (pn == vm_page_guard_addr) + return; + + pai = ppn_to_pai(pn); + + if (!IS_MANAGED_PAGE(pai)) { + /* + * Not a managed page. + */ + return; + } + PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, + pn, prot, 0, 0, 0); + + /* + * Determine the new protection. 
+ */ + switch (prot) { + case VM_PROT_READ: + case VM_PROT_READ | VM_PROT_EXECUTE: + remove = FALSE; + break; + case VM_PROT_ALL: + return; /* nothing to do */ + default: + remove = TRUE; + break; + } + + pv_h = pai_to_pvh(pai); + + LOCK_PVH(pai); + + + /* + * Walk down PV list, if any, changing or removing all mappings. + */ + if (pv_h->pmap == PMAP_NULL) + goto done; + + pv_e = pv_h; + pvh_e = (pv_hashed_entry_t) pv_e; /* cheat */ + + do { + vm_map_offset_t vaddr; + + pmap = pv_e->pmap; + vaddr = pv_e->va; + pte = pmap_pte(pmap, vaddr); + +#if DEBUG + if (pa_index(pte_to_pa(*pte)) != pn) + panic("pmap_page_protect: PTE mismatch, pn: 0x%x, pmap: %p, vaddr: 0x%llx, pte: 0x%llx", pn, pmap, vaddr, *pte); +#endif + if (0 == pte) { + panic("pmap_page_protect() " + "pmap=%p pn=0x%x vaddr=0x%llx\n", + pmap, pn, vaddr); + } + nexth = (pv_hashed_entry_t) queue_next(&pvh_e->qlink); + + /* + * Remove the mapping if new protection is NONE + * or if write-protecting a kernel mapping. + */ + if (remove || pmap == kernel_pmap) { + /* + * Remove the mapping, collecting dirty bits. + */ + pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_VALID); + PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); + pmap_phys_attributes[pai] |= + *pte & (PHYS_MODIFIED|PHYS_REFERENCED); + pmap_store_pte(pte, 0); + +#if TESTING + if (pmap->stats.resident_count < 1) + panic("pmap_page_protect: resident_count"); +#endif + assert(pmap->stats.resident_count >= 1); + OSAddAtomic(-1, &pmap->stats.resident_count); + + /* + * Deal with the pv_rooted_entry. + */ + + if (pv_e == pv_h) { + /* + * Fix up head later. + */ + pv_h->pmap = PMAP_NULL; + } else { + /* + * Delete this entry. + */ + pv_hash_remove(pvh_e); + pvh_e->qlink.next = (queue_entry_t) pvh_eh; + pvh_eh = pvh_e; + + if (pvh_et == PV_HASHED_ENTRY_NULL) + pvh_et = pvh_e; + pvh_cnt++; + } + } else { + /* + * Write-protect. 
+ */ + pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_WRITE); + PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); + } + pvh_e = nexth; + } while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h); + + + /* + * If pv_head mapping was removed, fix it up. + */ + if (pv_h->pmap == PMAP_NULL) { + pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink); + + if (pvh_e != (pv_hashed_entry_t) pv_h) { + pv_hash_remove(pvh_e); + pv_h->pmap = pvh_e->pmap; + pv_h->va = pvh_e->va; + pvh_e->qlink.next = (queue_entry_t) pvh_eh; + pvh_eh = pvh_e; + + if (pvh_et == PV_HASHED_ENTRY_NULL) + pvh_et = pvh_e; + pvh_cnt++; + } + } + if (pvh_eh != PV_HASHED_ENTRY_NULL) { + PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt); + } +done: + UNLOCK_PVH(pai); + + PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END, + 0, 0, 0, 0, 0); +} + +__private_extern__ void +pmap_pagetable_corruption_msg_log(int (*log_func)(const char * fmt, ...)__printflike(1,2)) { + if (pmap_pagetable_corruption_incidents > 0) { + int i, e = MIN(pmap_pagetable_corruption_incidents, PMAP_PAGETABLE_CORRUPTION_MAX_LOG); + (*log_func)("%u pagetable corruption incident(s) detected, timeout: %u\n", pmap_pagetable_corruption_incidents, pmap_pagetable_corruption_timeout); + for (i = 0; i < e; i++) { + (*log_func)("Incident 0x%x, reason: 0x%x, action: 0x%x, time: 0x%llx\n", pmap_pagetable_corruption_records[i].incident, pmap_pagetable_corruption_records[i].reason, pmap_pagetable_corruption_records[i].action, pmap_pagetable_corruption_records[i].abstime); + } + } +} + +void +mapping_free_prime(void) +{ + int i; + pv_hashed_entry_t pvh_e; + pv_hashed_entry_t pvh_eh; + pv_hashed_entry_t pvh_et; + int pv_cnt; + + pv_cnt = 0; + pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; + for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) { + pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); + + pvh_e->qlink.next = (queue_entry_t)pvh_eh; + pvh_eh = pvh_e; + + if (pvh_et == PV_HASHED_ENTRY_NULL) + pvh_et = pvh_e; + pv_cnt++; + } + PV_HASHED_FREE_LIST(pvh_eh, 
pvh_et, pv_cnt); + + pv_cnt = 0; + pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; + for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) { + pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); + + pvh_e->qlink.next = (queue_entry_t)pvh_eh; + pvh_eh = pvh_e; + + if (pvh_et == PV_HASHED_ENTRY_NULL) + pvh_et = pvh_e; + pv_cnt++; + } + PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt); + +} + +static inline void +pmap_pagetable_corruption_log_setup(void) { + if (pmap_pagetable_corruption_log_call == NULL) { + nanotime_to_absolutetime(PMAP_PAGETABLE_CORRUPTION_INTERVAL, 0, &pmap_pagetable_corruption_interval_abstime); + thread_call_setup(&pmap_pagetable_corruption_log_call_data, + (thread_call_func_t) pmap_pagetable_corruption_msg_log, + (thread_call_param_t) &printf); + pmap_pagetable_corruption_log_call = &pmap_pagetable_corruption_log_call_data; + } +} + +void +mapping_adjust(void) +{ + pv_hashed_entry_t pvh_e; + pv_hashed_entry_t pvh_eh; + pv_hashed_entry_t pvh_et; + int pv_cnt; + int i; + + if (mapping_adjust_call == NULL) { + thread_call_setup(&mapping_adjust_call_data, + (thread_call_func_t) mapping_adjust, + (thread_call_param_t) NULL); + mapping_adjust_call = &mapping_adjust_call_data; + } + + pmap_pagetable_corruption_log_setup(); + + pv_cnt = 0; + pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; + if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) { + for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) { + pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); + + pvh_e->qlink.next = (queue_entry_t)pvh_eh; + pvh_eh = pvh_e; + + if (pvh_et == PV_HASHED_ENTRY_NULL) + pvh_et = pvh_e; + pv_cnt++; + } + PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt); + } + + pv_cnt = 0; + pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; + if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) { + for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) { + pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); + + pvh_e->qlink.next = (queue_entry_t)pvh_eh; + pvh_eh = pvh_e; + + if (pvh_et == 
PV_HASHED_ENTRY_NULL) + pvh_et = pvh_e; + pv_cnt++; + } + PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt); + } + mappingrecurse = 0; +} + diff --git a/osfmk/ipc/ipc_kmsg.c b/osfmk/ipc/ipc_kmsg.c index 9adfe5b83..d61a26a77 100644 --- a/osfmk/ipc/ipc_kmsg.c +++ b/osfmk/ipc/ipc_kmsg.c @@ -595,7 +595,7 @@ ipc_kmsg_alloc( mach_msg_size_t max_desc = (mach_msg_size_t)(((size - sizeof(mach_msg_base_t)) / sizeof(mach_msg_ool_descriptor32_t)) * DESC_SIZE_ADJUSTMENT); - if (msg_and_trailer_size >= MACH_MSG_SIZE_MAX - max_desc) + if (msg_and_trailer_size > MACH_MSG_SIZE_MAX - max_desc) return IKM_NULL; max_expanded_size = msg_and_trailer_size + max_desc; @@ -617,12 +617,9 @@ ipc_kmsg_alloc( assert(i <= IKM_STASH); kmsg = cache->entries[--i]; cache->avail = i; - ikm_check_init(kmsg, max_expanded_size); enable_preemption(); - kmsg->ikm_header = (mach_msg_header_t *) - ((vm_offset_t)(kmsg + 1) + - max_expanded_size - - msg_and_trailer_size); + ikm_check_init(kmsg, max_expanded_size); + ikm_set_header(kmsg, msg_and_trailer_size); return (kmsg); } enable_preemption(); @@ -633,10 +630,7 @@ ipc_kmsg_alloc( if (kmsg != IKM_NULL) { ikm_init(kmsg, max_expanded_size); - kmsg->ikm_header = (mach_msg_header_t *) - ((vm_offset_t)(kmsg + 1) + - max_expanded_size - - msg_and_trailer_size); + ikm_set_header(kmsg, msg_and_trailer_size); } return(kmsg); @@ -1072,6 +1066,23 @@ ipc_kmsg_clear_prealloc( IP_CLEAR_PREALLOC(port, kmsg); } +/* + * Routine: ipc_kmsg_prealloc + * Purpose: + * Wrapper to ipc_kmsg_alloc() to account for + * header expansion requirements. + */ +ipc_kmsg_t +ipc_kmsg_prealloc(mach_msg_size_t size) +{ +#if defined(__LP64__) + if (size > MACH_MSG_SIZE_MAX - LEGACY_HEADER_SIZE_DELTA) + return IKM_NULL; + + size += LEGACY_HEADER_SIZE_DELTA; +#endif + return ipc_kmsg_alloc(size); +} /* @@ -1243,10 +1254,9 @@ ipc_kmsg_get_from_kernel( * clients. These are set up for those kernel clients * which cannot afford to wait. 
*/ -#ifndef __LP64__ - /* LP64todo - does the prealloc kmsg need ikm_header padding? - */ if (IP_PREALLOC(dest_port)) { + mach_msg_size_t max_desc = 0; + ip_lock(dest_port); if (!ip_active(dest_port)) { ip_unlock(dest_port); @@ -1254,19 +1264,26 @@ ipc_kmsg_get_from_kernel( } assert(IP_PREALLOC(dest_port)); kmsg = dest_port->ip_premsg; - if (msg_and_trailer_size > kmsg->ikm_size) { - ip_unlock(dest_port); - return MACH_SEND_TOO_LARGE; - } if (ikm_prealloc_inuse(kmsg)) { ip_unlock(dest_port); return MACH_SEND_NO_BUFFER; } +#if !defined(__LP64__) + if (msg->msgh_bits & MACH_MSGH_BITS_COMPLEX) { + assert(size > sizeof(mach_msg_base_t)); + max_desc = ((mach_msg_base_t *)msg)->body.msgh_descriptor_count * + DESC_SIZE_ADJUSTMENT; + } +#endif + if (msg_and_trailer_size > kmsg->ikm_size - max_desc) { + ip_unlock(dest_port); + return MACH_SEND_TOO_LARGE; + } ikm_prealloc_set_inuse(kmsg, dest_port); + ikm_set_header(kmsg, msg_and_trailer_size); ip_unlock(dest_port); } else -#endif /* !__LP64__ */ { kmsg = ipc_kmsg_alloc(msg_and_trailer_size); if (kmsg == IKM_NULL) diff --git a/osfmk/ipc/ipc_kmsg.h b/osfmk/ipc/ipc_kmsg.h index db4df8ad5..8687cafbf 100644 --- a/osfmk/ipc/ipc_kmsg.h +++ b/osfmk/ipc/ipc_kmsg.h @@ -162,6 +162,12 @@ MACRO_BEGIN \ assert((kmsg)->ikm_next == IKM_BOGUS); \ MACRO_END +#define ikm_set_header(kmsg, mtsize) \ +MACRO_BEGIN \ + (kmsg)->ikm_header = (mach_msg_header_t *) \ + ((vm_offset_t)((kmsg) + 1) + (kmsg)->ikm_size - (mtsize)); \ +MACRO_END + struct ipc_kmsg_queue { struct ipc_kmsg *ikmq_base; }; @@ -267,13 +273,16 @@ extern void ipc_kmsg_destroy( extern void ipc_kmsg_destroy_dest( ipc_kmsg_t kmsg); - /* Preallocate a kernel message buffer */ +extern ipc_kmsg_t ipc_kmsg_prealloc( + mach_msg_size_t size); + +/* bind a preallocated message buffer to a port */ extern void ipc_kmsg_set_prealloc( ipc_kmsg_t kmsg, ipc_port_t port); -/* Clear a kernel message buffer */ +/* Clear preallocated message buffer binding */ extern void ipc_kmsg_clear_prealloc( 
ipc_kmsg_t kmsg, ipc_port_t port); diff --git a/osfmk/ipc/ipc_port.c b/osfmk/ipc/ipc_port.c index eaa7bad40..76185c9ba 100644 --- a/osfmk/ipc/ipc_port.c +++ b/osfmk/ipc/ipc_port.c @@ -107,6 +107,7 @@ decl_lck_mtx_data(, ipc_port_timestamp_lock_data) lck_mtx_ext_t ipc_port_multiple_lock_data_ext; lck_mtx_ext_t ipc_port_timestamp_lock_data_ext; ipc_port_timestamp_t ipc_port_timestamp_data; +int ipc_portbt; #if MACH_ASSERT void ipc_port_init_debug( @@ -1235,8 +1236,14 @@ ipc_port_debug_init(void) { queue_init(&port_alloc_queue); lck_mtx_init_ext(&port_alloc_queue_lock, &port_alloc_queue_lock_ext, &ipc_lck_grp, &ipc_lck_attr); + + if (!PE_parse_boot_argn("ipc_portbt", &ipc_portbt, sizeof (ipc_portbt))) + ipc_portbt = 0; } +#ifdef MACH_BSD +extern int proc_pid(struct proc*); +#endif /* MACH_BSD */ /* * Initialize all of the debugging state in a port. @@ -1255,12 +1262,22 @@ ipc_port_init_debug( for (i = 0; i < IP_NSPARES; ++i) port->ip_spares[i] = 0; +#ifdef MACH_BSD + task_t task = current_task(); + if (task != TASK_NULL) { + struct proc* proc = (struct proc*) get_bsdtask_info(task); + if (proc) + port->ip_spares[0] = proc_pid(proc); + } +#endif /* MACH_BSD */ + /* * Machine-dependent routine to fill in an * array with up to IP_CALLSTACK_MAX levels * of return pc information. */ - machine_callstack(&port->ip_callstack[0], IP_CALLSTACK_MAX); + if (ipc_portbt) + machine_callstack(&port->ip_callstack[0], IP_CALLSTACK_MAX); #if 0 lck_mtx_lock(&port_alloc_queue_lock); diff --git a/osfmk/ipc/ipc_port.h b/osfmk/ipc/ipc_port.h index 7249fe96a..4998a84bc 100644 --- a/osfmk/ipc/ipc_port.h +++ b/osfmk/ipc/ipc_port.h @@ -150,8 +150,8 @@ struct ipc_port { #endif #if MACH_ASSERT -#define IP_NSPARES 10 -#define IP_CALLSTACK_MAX 10 +#define IP_NSPARES 4 +#define IP_CALLSTACK_MAX 16 queue_chain_t ip_port_links; /* all allocated ports */ thread_t ip_thread; /* who made me? 
thread context */ unsigned long ip_timetrack; /* give an idea of "when" created */ diff --git a/osfmk/ipc/mach_port.c b/osfmk/ipc/mach_port.c index 28d0fbc87..389e80bb1 100644 --- a/osfmk/ipc/mach_port.c +++ b/osfmk/ipc/mach_port.c @@ -644,9 +644,11 @@ mach_port_allocate_full( return KERN_RESOURCE_SHORTAGE; } else { mach_msg_size_t size = qosp->len + MAX_TRAILER_SIZE; + if (right != MACH_PORT_RIGHT_RECEIVE) return (KERN_INVALID_VALUE); - kmsg = (ipc_kmsg_t)ipc_kmsg_alloc(size); + + kmsg = (ipc_kmsg_t)ipc_kmsg_prealloc(size); if (kmsg == IKM_NULL) return (KERN_RESOURCE_SHORTAGE); } diff --git a/osfmk/kdp/kdp.c b/osfmk/kdp/kdp.c index dbacccfd8..df1d6d953 100644 --- a/osfmk/kdp/kdp.c +++ b/osfmk/kdp/kdp.c @@ -29,10 +29,12 @@ #include #include #include +#include #include #include #include +#include #include @@ -115,6 +117,7 @@ int noresume_on_disconnect = 0; extern unsigned int return_on_panic; typedef struct thread_snapshot *thread_snapshot_t; +typedef struct task_snapshot *task_snapshot_t; extern int machine_trace_thread(thread_t thread, char *tracepos, char *tracebound, int nframes, boolean_t user_p); @@ -143,7 +146,7 @@ kdp_remove_breakpoint_internal( int -kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, unsigned trace_options, uint32_t *pbytesTraced); +kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_flags, uint32_t dispatch_offset, uint32_t *pbytesTraced); boolean_t kdp_copyin(pmap_t, uint64_t, void *, size_t); extern void bcopy_phys(addr64_t, addr64_t, vm_size_t); @@ -1064,7 +1067,7 @@ kdp_copyin(pmap_t p, uint64_t uaddr, void *dest, size_t size) { } int -kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, unsigned trace_options, uint32_t *pbytesTraced) +kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_flags, uint32_t dispatch_offset, uint32_t *pbytesTraced) { char *tracepos = (char *) tracebuf; char *tracebound = tracepos + tracebuf_size; @@ -1073,49 +1076,105 @@ 
kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, unsigned trace_op task_t task = TASK_NULL; thread_t thread = THREAD_NULL; - int nframes = trace_options; thread_snapshot_t tsnap = NULL; unsigned framesize = 2 * sizeof(vm_offset_t); - boolean_t dispatch_p = ((trace_options & STACKSHOT_GET_DQ) != 0); - uint16_t dispatch_offset = (trace_options & STACKSHOT_DISPATCH_OFFSET_MASK) >> STACKSHOT_DISPATCH_OFFSET_SHIFT; struct task ctask; struct thread cthread; - - if ((nframes <= 0) || nframes > MAX_FRAMES) - nframes = MAX_FRAMES; + + boolean_t dispatch_p = ((trace_flags & STACKSHOT_GET_DQ) != 0); + boolean_t save_loadinfo_p = ((trace_flags & STACKSHOT_SAVE_LOADINFO) != 0); queue_iterate(&tasks, task, task_t, tasks) { + int task_pid = pid_from_task(task); + boolean_t task64 = task_has_64BitAddr(task); + if ((task == NULL) || (ml_nofault_copy((vm_offset_t) task, (vm_offset_t) &ctask, sizeof(struct task)) != sizeof(struct task))) goto error_exit; + /* Trace everything, unless a process was specified */ - if ((pid == -1) || (pid == pid_from_task(task))) + if ((pid == -1) || (pid == task_pid)) { + task_snapshot_t task_snap; + uint32_t uuid_info_count; + mach_vm_address_t uuid_info_addr; + + if (save_loadinfo_p && task_pid > 0) { + // Read the dyld_all_image_infos struct from the task memory to get UUID array count and location + if (task64) { + struct dyld_all_image_infos64 task_image_infos; + if (!kdp_copyin(task->map->pmap, task->all_image_info_addr, &task_image_infos, sizeof(struct dyld_all_image_infos64))) + goto error_exit; + uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount; + uuid_info_addr = task_image_infos.uuidArray; + } else { + struct dyld_all_image_infos task_image_infos; + if (!kdp_copyin(task->map->pmap, task->all_image_info_addr, &task_image_infos, sizeof(struct dyld_all_image_infos))) + goto error_exit; + uuid_info_count = task_image_infos.uuidArrayCount; + uuid_info_addr = task_image_infos.uuidArray; + } + } else { + uuid_info_count = 
0; + uuid_info_addr = 0; + } + + if (tracepos + sizeof(struct task_snapshot) > tracebound) { + error = -1; + goto error_exit; + } + + task_snap = (task_snapshot_t) tracepos; + task_snap->snapshot_magic = STACKSHOT_TASK_SNAPSHOT_MAGIC; + task_snap->pid = task_pid; + task_snap->nloadinfos = uuid_info_count; + /* Add the BSD process identifiers */ + if (task_pid != -1) + proc_name_kdp(task, task_snap->p_comm, sizeof(task_snap->p_comm)); + else + task_snap->p_comm[0] = '\0'; + task_snap->ss_flags = 0; + if (task64) + task_snap->ss_flags |= kUser64_p; + + tracepos += sizeof(struct task_snapshot); + + if (task_pid > 0 && uuid_info_count > 0) { + uint32_t uuid_info_size = (uint32_t)(task64 ? sizeof(struct dyld_uuid_info64) : sizeof(struct dyld_uuid_info)); + uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size; + + if (tracepos + uuid_info_array_size > tracebound) { + error = -1; + goto error_exit; + } + + // Copy in the UUID info array + if (!kdp_copyin(task->map->pmap, uuid_info_addr, tracepos, uuid_info_array_size)) + goto error_exit; + + tracepos += uuid_info_array_size; + } + queue_iterate(&task->threads, thread, thread_t, task_threads){ if ((thread == NULL) || (ml_nofault_copy((vm_offset_t) thread, (vm_offset_t) &cthread, sizeof(struct thread)) != sizeof(struct thread))) goto error_exit; + if (((tracepos + 4 * sizeof(struct thread_snapshot)) > tracebound)) { error = -1; goto error_exit; } -/* Populate the thread snapshot header */ + /* Populate the thread snapshot header */ tsnap = (thread_snapshot_t) tracepos; tsnap->thread_id = (uint64_t) (uintptr_t)thread; tsnap->state = thread->state; tsnap->wait_event = thread->wait_event; tsnap->continuation = (uint64_t) (uintptr_t) thread->continuation; -/* Add the BSD process identifiers */ - if ((tsnap->pid = pid_from_task(task)) != -1) - proc_name_kdp(task, tsnap->p_comm, sizeof(tsnap->p_comm)); - else - tsnap->p_comm[0] = '\0'; - tsnap->snapshot_magic = 0xfeedface; + tsnap->snapshot_magic = 
STACKSHOT_THREAD_SNAPSHOT_MAGIC; tracepos += sizeof(struct thread_snapshot); tsnap->ss_flags = 0; if (dispatch_p && (task != kernel_task) && (task->active) && (task->map)) { uint64_t dqkeyaddr = thread_dispatchqaddr(thread); if (dqkeyaddr != 0) { - boolean_t task64 = task_has_64BitAddr(task); uint64_t dqaddr = 0; if (kdp_copyin(task->map->pmap, dqkeyaddr, &dqaddr, (task64 ? 8 : 4)) && (dqaddr != 0)) { uint64_t dqserialnumaddr = dqaddr + dispatch_offset; @@ -1133,27 +1192,27 @@ kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, unsigned trace_op */ if (thread->kernel_stack != 0) { #if defined(__LP64__) - tracebytes = machine_trace_thread64(thread, tracepos, tracebound, nframes, FALSE); + tracebytes = machine_trace_thread64(thread, tracepos, tracebound, MAX_FRAMES, FALSE); tsnap->ss_flags |= kKernel64_p; framesize = 16; #else - tracebytes = machine_trace_thread(thread, tracepos, tracebound, nframes, FALSE); + tracebytes = machine_trace_thread(thread, tracepos, tracebound, MAX_FRAMES, FALSE); framesize = 8; #endif } tsnap->nkern_frames = tracebytes/framesize; tracepos += tracebytes; tracebytes = 0; -/* Trace user stack, if any */ + /* Trace user stack, if any */ if (thread->task->map != kernel_map) { /* 64-bit task? 
*/ if (task_has_64BitAddr(thread->task)) { - tracebytes = machine_trace_thread64(thread, tracepos, tracebound, nframes, TRUE); + tracebytes = machine_trace_thread64(thread, tracepos, tracebound, MAX_FRAMES, TRUE); tsnap->ss_flags |= kUser64_p; framesize = 16; } else { - tracebytes = machine_trace_thread(thread, tracepos, tracebound, nframes, TRUE); + tracebytes = machine_trace_thread(thread, tracepos, tracebound, MAX_FRAMES, TRUE); framesize = 8; } } @@ -1161,6 +1220,7 @@ kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, unsigned trace_op tracepos += tracebytes; tracebytes = 0; } + } } error_exit: diff --git a/osfmk/kdp/kdp_dyld.h b/osfmk/kdp/kdp_dyld.h new file mode 100644 index 000000000..ef228574e --- /dev/null +++ b/osfmk/kdp/kdp_dyld.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Data structure definitions copied from dyld so that we can read dyld's saved UUID information + * for each binary image not loaded from the shared cache during stackshots. + */ + +/* From dyld/include/dyld_images.h */ + +struct dyld_uuid_info { + user32_addr_t imageLoadAddress; /* base address image is mapped into */ + uuid_t imageUUID; /* UUID of image */ +}; + +struct dyld_uuid_info64 { + user64_addr_t imageLoadAddress; /* base address image is mapped into */ + uuid_t imageUUID; /* UUID of image */ +}; + +// FIXME: dyld is in C++, and some of the fields in dyld_all_image_infos are C++ +// native booleans. There must be a better way... +typedef uint8_t bool; + +struct dyld_all_image_infos { + uint32_t version; + uint32_t infoArrayCount; + user32_addr_t infoArray; + user32_addr_t notification; + bool processDetachedFromSharedRegion; + bool libSystemInitialized; + user32_addr_t dyldImageLoadAddress; + user32_addr_t jitInfo; + user32_addr_t dyldVersion; + user32_addr_t errorMessage; + user32_addr_t terminationFlags; + user32_addr_t coreSymbolicationShmPage; + user32_addr_t systemOrderFlag; + user32_size_t uuidArrayCount; // dyld defines this as a uintptr_t despite it being a count + user32_addr_t uuidArray; +}; + +struct dyld_all_image_infos64 { + uint32_t version; + uint32_t infoArrayCount; + user64_addr_t infoArray; + user64_addr_t notification; + bool processDetachedFromSharedRegion; + bool libSystemInitialized; + user64_addr_t dyldImageLoadAddress; + user64_addr_t jitInfo; + user64_addr_t dyldVersion; + user64_addr_t errorMessage; + user64_addr_t terminationFlags; + user64_addr_t coreSymbolicationShmPage; + user64_addr_t systemOrderFlag; + user64_size_t uuidArrayCount; // dyld defines this as a uintptr_t despite it being a count + user64_addr_t uuidArray; +}; diff --git a/osfmk/kdp/kdp_udp.c 
b/osfmk/kdp/kdp_udp.c index 5cab18769..0a54c5f2e 100644 --- a/osfmk/kdp/kdp_udp.c +++ b/osfmk/kdp/kdp_udp.c @@ -195,20 +195,21 @@ static unsigned stack_snapshot_bytes_traced = 0; static void *stack_snapshot_buf; static uint32_t stack_snapshot_bufsize; static int stack_snapshot_pid; -static uint32_t stack_snapshot_options; +static uint32_t stack_snapshot_flags; +static uint32_t stack_snapshot_dispatch_offset; static unsigned int old_debugger; void kdp_snapshot_preflight(int pid, void * tracebuf, uint32_t tracebuf_size, - uint32_t options); + uint32_t flags, uint32_t dispatch_offset); void kdp_snapshot_postflight(void); extern int kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, - unsigned trace_options, uint32_t *pbytesTraced); + uint32_t flags, uint32_t dispatch_offset, uint32_t *pbytesTraced); int kdp_stack_snapshot_geterror(void); @@ -308,12 +309,13 @@ kdp_unregister_send_receive( /* Cache stack snapshot parameters in preparation for a trace */ void -kdp_snapshot_preflight(int pid, void * tracebuf, uint32_t tracebuf_size, uint32_t options) +kdp_snapshot_preflight(int pid, void * tracebuf, uint32_t tracebuf_size, uint32_t flags, uint32_t dispatch_offset) { stack_snapshot_pid = pid; stack_snapshot_buf = tracebuf; stack_snapshot_bufsize = tracebuf_size; - stack_snapshot_options = options; + stack_snapshot_flags = flags; + stack_snapshot_dispatch_offset = dispatch_offset; kdp_snapshot++; /* Mark this debugger as active, since the polled mode driver that * ordinarily does this may not be enabled (yet), or since KDB may be @@ -1114,7 +1116,8 @@ kdp_raise_exception( if (kdp_snapshot && (!panic_active()) && (panic_caller == 0)) { stack_snapshot_ret = kdp_stackshot(stack_snapshot_pid, stack_snapshot_buf, stack_snapshot_bufsize, - stack_snapshot_options, &stack_snapshot_bytes_traced); + stack_snapshot_flags, stack_snapshot_dispatch_offset, + &stack_snapshot_bytes_traced); return; } diff --git a/osfmk/kern/debug.c b/osfmk/kern/debug.c index 
acec72979..39aa1f425 100644 --- a/osfmk/kern/debug.c +++ b/osfmk/kern/debug.c @@ -431,16 +431,19 @@ extern const char version[]; extern char osversion[]; __private_extern__ void panic_display_system_configuration(void) { - static boolean_t config_displayed = FALSE; + static volatile boolean_t config_displayed = FALSE; panic_display_process_name(); if (config_displayed == FALSE) { + config_displayed = TRUE; kdb_printf("\nMac OS version:\n%s\n", (osversion[0] != 0) ? osversion : "Not yet set"); kdb_printf("\nKernel version:\n%s\n",version); panic_display_model_name(); panic_display_uptime(); - config_displayed = TRUE; +#if defined(__i386__) || defined(__x86_64__) + pmap_pagetable_corruption_msg_log(&kdb_printf); +#endif /* i386 || x86_64 */ panic_display_zprint(); kext_dump_panic_lists(&kdb_log); } diff --git a/osfmk/kern/debug.h b/osfmk/kern/debug.h index d4ad172b9..308435ece 100644 --- a/osfmk/kern/debug.h +++ b/osfmk/kern/debug.h @@ -39,12 +39,18 @@ struct thread_snapshot { uint32_t snapshot_magic; uint32_t nkern_frames; uint32_t nuser_frames; - int32_t pid; uint64_t wait_event; uint64_t continuation; uint64_t thread_id; int32_t state; char ss_flags; +} __attribute__ ((packed)); + +struct task_snapshot { + uint32_t snapshot_magic; + int32_t pid; + uint32_t nloadinfos; + char ss_flags; /* We restrict ourselves to a statically defined * (current as of 2009) length for the * p_comm string, due to scoping issues (osfmk/bsd and user/kernel @@ -59,9 +65,13 @@ enum { kHasDispatchSerial = 0x4 }; -enum {STACKSHOT_GET_DQ = 1}; -#define STACKSHOT_DISPATCH_OFFSET_MASK 0xffff0000 -#define STACKSHOT_DISPATCH_OFFSET_SHIFT 16 +enum { + STACKSHOT_GET_DQ = 0x1, + STACKSHOT_SAVE_LOADINFO = 0x2 +}; + +#define STACKSHOT_THREAD_SNAPSHOT_MAGIC 0xfeedface +#define STACKSHOT_TASK_SNAPSHOT_MAGIC 0xdecafbad #endif /* __APPLE_API_UNSTABLE */ #endif /* __APPLE_API_PRIVATE */ @@ -70,6 +80,7 @@ enum {STACKSHOT_GET_DQ = 1}; extern unsigned int systemLogDiags; extern char debug_buf[]; +extern 
unsigned int debug_boot_arg; #ifdef MACH_KERNEL_PRIVATE diff --git a/osfmk/kern/processor.c b/osfmk/kern/processor.c index 341069724..ca65ceca6 100644 --- a/osfmk/kern/processor.c +++ b/osfmk/kern/processor.c @@ -155,6 +155,15 @@ processor_init( processor_data_init(processor); processor->processor_list = NULL; + pset_lock(pset); + if (pset->cpu_set_count++ == 0) + pset->cpu_set_low = pset->cpu_set_hi = cpu_id; + else { + pset->cpu_set_low = (cpu_id < pset->cpu_set_low)? cpu_id: pset->cpu_set_low; + pset->cpu_set_hi = (cpu_id > pset->cpu_set_hi)? cpu_id: pset->cpu_set_hi; + } + pset_unlock(pset); + simple_lock(&processor_list_lock); if (processor_list == NULL) processor_list = processor; @@ -231,6 +240,8 @@ pset_init( queue_init(&pset->idle_queue); pset->processor_count = 0; pset->low_pri = pset->low_count = PROCESSOR_NULL; + pset->cpu_set_low = pset->cpu_set_hi = 0; + pset->cpu_set_count = 0; pset_lock_init(pset); pset->pset_self = IP_NULL; pset->pset_name_self = IP_NULL; diff --git a/osfmk/kern/processor.h b/osfmk/kern/processor.h index fcf61d044..342a90081 100644 --- a/osfmk/kern/processor.h +++ b/osfmk/kern/processor.h @@ -89,6 +89,9 @@ struct processor_set { int processor_count; + int cpu_set_low, cpu_set_hi; + int cpu_set_count; + decl_simple_lock_data(,sched_lock) /* lock for above */ struct ipc_port * pset_self; /* port for operations */ @@ -244,11 +247,15 @@ extern kern_return_t processor_info_count( #define pset_deallocate(x) #define pset_reference(x) -extern void machine_run_count( - uint32_t count); +extern void machine_run_count( + uint32_t count); + +extern boolean_t machine_processor_is_inactive( + processor_t processor); -extern boolean_t machine_cpu_is_inactive( - int cpu_id); +extern processor_t machine_choose_processor( + processor_set_t pset, + processor_t processor); #else /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/kern/sched.h b/osfmk/kern/sched.h index e1e5ae4c0..9a153ea29 100644 --- a/osfmk/kern/sched.h +++ b/osfmk/kern/sched.h @@ -158,6 
+158,7 @@ #define BASEPRI_FOREGROUND (BASEPRI_DEFAULT + 16) /* 47 */ #define BASEPRI_BACKGROUND (BASEPRI_DEFAULT + 15) /* 46 */ #define BASEPRI_DEFAULT (MAXPRI_USER - (NRQS / 4)) /* 31 */ +#define MAXPRI_THROTTLE (MINPRI + 4) /* 4 */ #define MINPRI_USER MINPRI /* 0 */ /* diff --git a/osfmk/kern/sched_prim.c b/osfmk/kern/sched_prim.c index 2dc656aec..60191650c 100644 --- a/osfmk/kern/sched_prim.c +++ b/osfmk/kern/sched_prim.c @@ -1104,7 +1104,7 @@ thread_select( pset_lock(pset); - inactive_state = processor->state != PROCESSOR_SHUTDOWN && machine_cpu_is_inactive(processor->cpu_id); + inactive_state = processor->state != PROCESSOR_SHUTDOWN && machine_processor_is_inactive(processor); simple_lock(&rt_lock); @@ -1680,8 +1680,7 @@ thread_dispatch( thread->realtime.deadline = UINT64_MAX; thread->reason |= AST_QUANTUM; } - } - else { + } else { /* * For non-realtime threads treat a tiny * remaining quantum as an expired quantum @@ -1726,12 +1725,25 @@ thread_dispatch( /* * Waiting. */ + boolean_t should_terminate = FALSE; + + /* Only the first call to thread_dispatch + * after explicit termination should add + * the thread to the termination queue + */ + if ((thread->state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) { + should_terminate = TRUE; + thread->state |= TH_TERMINATE2; + } + thread->state &= ~TH_RUN; if (thread->sched_mode & TH_MODE_TIMESHARE) sched_share_decr(); sched_run_decr(); + (*thread->sched_call)(SCHED_CALL_BLOCK, thread); + if (thread->wake_active) { thread->wake_active = FALSE; thread_unlock(thread); @@ -1743,9 +1755,7 @@ thread_dispatch( wake_unlock(thread); - (*thread->sched_call)(SCHED_CALL_BLOCK, thread); - - if (thread->state & TH_TERMINATE) + if (should_terminate) thread_terminate_enqueue(thread); } } @@ -2232,6 +2242,7 @@ choose_next_pset( * choose_processor: * * Choose a processor for the thread, beginning at + * the pset. Accepts an optional processor hint in * the pset. * * Returns a processor, possibly from a different pset. 
@@ -2242,19 +2253,25 @@ choose_next_pset( static processor_t choose_processor( processor_set_t pset, + processor_t processor, thread_t thread) { processor_set_t nset, cset = pset; - processor_t processor = thread->last_processor; processor_meta_t pmeta = PROCESSOR_META_NULL; /* - * Prefer the last processor, when appropriate. + * Prefer the hinted processor, when appropriate. */ if (processor != PROCESSOR_NULL) { + processor_t mprocessor; + if (processor->processor_meta != PROCESSOR_META_NULL) processor = processor->processor_meta->primary; + mprocessor = machine_choose_processor(pset, processor); + if (mprocessor != PROCESSOR_NULL) + processor = mprocessor; + if (processor->processor_set != pset || processor->state == PROCESSOR_INACTIVE || processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE) processor = PROCESSOR_NULL; @@ -2262,6 +2279,18 @@ choose_processor( if (processor->state == PROCESSOR_IDLE) return (processor); } + else { + processor = machine_choose_processor(pset, processor); + + if (processor != PROCESSOR_NULL) { + if (processor->processor_set != pset || processor->state == PROCESSOR_INACTIVE || + processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE) + processor = PROCESSOR_NULL; + else + if (processor->state == PROCESSOR_IDLE) + return (processor); + } + } /* * Iterate through the processor sets to locate @@ -2447,7 +2476,7 @@ thread_setrun( pset = thread->affinity_set->aset_pset; pset_lock(pset); - processor = choose_processor(pset, thread); + processor = choose_processor(pset, PROCESSOR_NULL, thread); } else if (thread->last_processor != PROCESSOR_NULL) { @@ -2468,10 +2497,10 @@ thread_setrun( */ if (thread->sched_pri <= processor->current_pri || thread->realtime.deadline >= processor->deadline) - processor = choose_processor(pset, thread); + processor = choose_processor(pset, PROCESSOR_NULL, thread); } else - processor = choose_processor(pset, thread); + processor = choose_processor(pset, 
processor, thread); } else { /* @@ -2489,7 +2518,7 @@ thread_setrun( pset = choose_next_pset(pset); pset_lock(pset); - processor = choose_processor(pset, thread); + processor = choose_processor(pset, PROCESSOR_NULL, thread); task->pset_hint = processor->processor_set; } } @@ -2645,7 +2674,7 @@ csw_check( processor->processor_meta->primary != processor) return (AST_PREEMPT); - if (machine_cpu_is_inactive(processor->cpu_id)) + if (machine_processor_is_inactive(processor)) return (AST_PREEMPT); if (processor->active_thread->state & TH_SUSP) @@ -2925,7 +2954,7 @@ processor_idle( (void)splsched(); - if (processor->state == PROCESSOR_INACTIVE && !machine_cpu_is_inactive(processor->cpu_id)) + if (processor->state == PROCESSOR_INACTIVE && !machine_processor_is_inactive(processor)) break; } diff --git a/osfmk/kern/sched_prim.h b/osfmk/kern/sched_prim.h index d47b67c52..9f1c95347 100644 --- a/osfmk/kern/sched_prim.h +++ b/osfmk/kern/sched_prim.h @@ -159,9 +159,6 @@ extern void idle_thread(void); extern kern_return_t idle_thread_create( processor_t processor); -/* Start thread running */ -extern void thread_bootstrap_return(void); - /* Continuation return from syscall */ extern void thread_syscall_return( kern_return_t ret); @@ -225,6 +222,9 @@ extern kern_return_t clear_wait( thread_t thread, wait_result_t result); +/* Start thread running */ +extern void thread_bootstrap_return(void); + /* Return from exception (BSD-visible interface) */ extern void thread_exception_return(void) __dead2; diff --git a/osfmk/kern/task_policy.c b/osfmk/kern/task_policy.c index 0f027820d..d3395ddb4 100644 --- a/osfmk/kern/task_policy.c +++ b/osfmk/kern/task_policy.c @@ -105,6 +105,16 @@ task_policy_set( task->role = info->role; } } + else + if (info->role == TASK_THROTTLE_APPLICATION) { + task_priority(task, MAXPRI_THROTTLE, MAXPRI_THROTTLE); + task->role = info->role; + } + else + if (info->role == TASK_DEFAULT_APPLICATION) { + task_priority(task, BASEPRI_DEFAULT, MAXPRI_USER); + task->role = 
info->role; + } else result = KERN_INVALID_ARGUMENT; diff --git a/osfmk/kern/thread.c b/osfmk/kern/thread.c index b33a7d2be..581a37c7f 100644 --- a/osfmk/kern/thread.c +++ b/osfmk/kern/thread.c @@ -822,6 +822,7 @@ thread_create_running( kern_return_t thread_create_workq( task_t task, + thread_continue_t thread_return, thread_t *new_thread) { kern_return_t result; @@ -830,8 +831,7 @@ thread_create_workq( if (task == TASK_NULL || task == kernel_task) return (KERN_INVALID_ARGUMENT); - result = thread_create_internal(task, -1, (thread_continue_t)thread_bootstrap_return, - TH_OPTION_NOCRED | TH_OPTION_NOSUSP, &thread); + result = thread_create_internal(task, -1, thread_return, TH_OPTION_NOCRED | TH_OPTION_NOSUSP, &thread); if (result != KERN_SUCCESS) return (result); diff --git a/osfmk/kern/thread.h b/osfmk/kern/thread.h index 61217f52e..db2c6e352 100644 --- a/osfmk/kern/thread.h +++ b/osfmk/kern/thread.h @@ -178,6 +178,7 @@ struct thread { #define TH_RUN 0x04 /* running or on runq */ #define TH_UNINT 0x08 /* waiting uninteruptibly */ #define TH_TERMINATE 0x10 /* halted at termination */ +#define TH_TERMINATE2 0x20 /* added to termination queue */ #define TH_IDLE 0x80 /* idling processor */ @@ -640,6 +641,7 @@ __BEGIN_DECLS extern kern_return_t thread_create_workq( task_t task, + thread_continue_t thread_return, thread_t *new_thread); extern void thread_yield_internal( diff --git a/osfmk/kern/thread_call.c b/osfmk/kern/thread_call.c index ab9bab486..92f0b642b 100644 --- a/osfmk/kern/thread_call.c +++ b/osfmk/kern/thread_call.c @@ -59,6 +59,7 @@ struct thread_call_group { timer_call_data_t delayed_timer; struct wait_queue idle_wqueue; + struct wait_queue daemon_wqueue; uint32_t idle_count, active_count; }; @@ -149,6 +150,7 @@ thread_call_initialize(void) timer_call_setup(&group->delayed_timer, thread_call_delayed_timer, group); wait_queue_init(&group->idle_wqueue, SYNC_POLICY_FIFO); + wait_queue_init(&group->daemon_wqueue, SYNC_POLICY_FIFO); 
queue_init(&thread_call_internal_queue); for ( @@ -772,7 +774,7 @@ thread_call_wake( else if (!thread_call_daemon_awake) { thread_call_daemon_awake = TRUE; - thread_wakeup_one(&thread_call_daemon_awake); + wait_queue_wakeup_one(&group->daemon_wqueue, NULL, THREAD_AWAKENED); } } @@ -901,8 +903,8 @@ thread_call_daemon_continue( simple_lock(&thread_call_lock); } - thread_call_daemon_awake = FALSE; - assert_wait(&thread_call_daemon_awake, THREAD_UNINT); + thread_call_daemon_awake = FALSE; + wait_queue_assert_wait(&group->daemon_wqueue, NULL, THREAD_UNINT, 0); simple_unlock(&thread_call_lock); (void) spllo(); diff --git a/osfmk/mach/task_policy.h b/osfmk/mach/task_policy.h index a07f4c98f..3a2fb39c4 100644 --- a/osfmk/mach/task_policy.h +++ b/osfmk/mach/task_policy.h @@ -109,7 +109,9 @@ enum task_role { TASK_FOREGROUND_APPLICATION, TASK_BACKGROUND_APPLICATION, TASK_CONTROL_APPLICATION, - TASK_GRAPHICS_SERVER + TASK_GRAPHICS_SERVER, + TASK_THROTTLE_APPLICATION, + TASK_DEFAULT_APPLICATION }; typedef enum task_role task_role_t; diff --git a/osfmk/mach/vm_prot.h b/osfmk/mach/vm_prot.h index eed510771..6fe17d43c 100644 --- a/osfmk/mach/vm_prot.h +++ b/osfmk/mach/vm_prot.h @@ -130,4 +130,12 @@ typedef int vm_prot_t; #define VM_PROT_WANTS_COPY ((vm_prot_t) 0x10) +/* + * The caller wants this memory region treated as if it had a valid + * code signature. 
+ */ + +#define VM_PROT_TRUSTED ((vm_prot_t) 0x20) + + #endif /* _MACH_VM_PROT_H_ */ diff --git a/osfmk/ppc/machine_routines.c b/osfmk/ppc/machine_routines.c index 7edacae01..bc79f0c7c 100644 --- a/osfmk/ppc/machine_routines.c +++ b/osfmk/ppc/machine_routines.c @@ -820,11 +820,17 @@ machine_run_count(__unused uint32_t count) } boolean_t -machine_cpu_is_inactive(__unused int num) +machine_processor_is_inactive(__unused processor_t processor) { return(FALSE); } +processor_t +machine_choose_processor(__unused processor_set_t pset, processor_t processor) +{ + return (processor); +} + vm_offset_t ml_stack_remaining(void) { uintptr_t local = (uintptr_t) &local; diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index cc652d4f8..b339dbd7d 100644 --- a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ -2351,7 +2351,7 @@ vm_fault_enter(vm_page_t m, /* Page might have been tainted before or not; now it * definitively is. If the page wasn't tainted, we must * disconnect it from all pmaps later. 
*/ - must_disconnect = ~m->cs_tainted; + must_disconnect = !m->cs_tainted; m->cs_tainted = TRUE; cs_enter_tainted_accepted++; } diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index d48a044fa..a0f5e8c9b 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -4704,6 +4704,8 @@ vm_map_submap_pmap_clean( submap_end = offset + (end - start); submap_start = offset; + + vm_map_lock_read(sub_map); if(vm_map_lookup_entry(sub_map, offset, &entry)) { remove_size = (entry->vme_end - entry->vme_start); @@ -4775,7 +4777,8 @@ vm_map_submap_pmap_clean( } } entry = entry->vme_next; - } + } + vm_map_unlock_read(sub_map); return; } @@ -12547,3 +12550,116 @@ void vm_map_switch_protect(vm_map_t map, map->switch_protect=val; vm_map_unlock(map); } + +/* + * Add (generate) code signature for memory range [start, end). + * + * The range must lie within a single non-submap map entry that already + * has a VM object, and every page in it must be resident and not busy, + * in error, restarting, private or absent. Each page is marked + * cs_validated and disconnected with pmap_disconnect(). + * + * Returns KERN_SUCCESS, KERN_INVALID_ARGUMENT (null map, range not + * covered by one entry, or no object), KERN_INVALID_ADDRESS (start is + * unmapped or in a submap) or KERN_FAILURE (a page is missing or in a + * special state; pages already processed stay validated). + */ +#if CONFIG_DYNAMIC_CODE_SIGNING +kern_return_t vm_map_sign(vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end) +{ + vm_map_entry_t entry; + vm_page_t m; + vm_object_t object; + vm_object_offset_t offset; + + /* + * Vet all the input parameters and current type and state of the + * underlying object. Return with an error if anything is amiss. + */ + if (map == VM_MAP_NULL) + return(KERN_INVALID_ARGUMENT); + + vm_map_lock_read(map); + + if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) { + /* + * Must pass a valid non-submap address. + */ + vm_map_unlock_read(map); + return(KERN_INVALID_ADDRESS); + } + + if((entry->vme_start > start) || (entry->vme_end < end)) { + /* + * Map entry doesn't cover the requested range. Not handling + * this situation currently. + */ + vm_map_unlock_read(map); + return(KERN_INVALID_ARGUMENT); + } + + object = entry->object.vm_object; + if (object == VM_OBJECT_NULL) { + /* + * Object must already be present or we can't sign.
+ */ + vm_map_unlock_read(map); + return KERN_INVALID_ARGUMENT; + } + + /* + * Translate the start of the range into an object offset while the + * map read lock still pins 'entry'; the entry must not be + * dereferenced once the map is unlocked. + */ + offset = start - entry->vme_start + entry->offset; + + vm_object_lock(object); + vm_map_unlock_read(map); + + while(start < end) { + uint32_t refmod; + + m = vm_page_lookup(object, offset); + if (m==VM_PAGE_NULL) { + /* should we try to fault a page here? we can probably + * demand it exists and is locked for this request */ + vm_object_unlock(object); + return KERN_FAILURE; + } + /* deal with special page status */ + if (m->busy || + (m->unusual && (m->error || m->restart || m->private || m->absent))) { + vm_object_unlock(object); + return KERN_FAILURE; + } + + /* Page is OK... now "validate" it */ + /* This is the place where we'll call out to create a code + * directory, later */ + m->cs_validated = TRUE; + + /* The page is now "clean" for codesigning purposes. That means + * we don't consider it as modified (wpmapped) anymore. But + * we'll disconnect the page so we note any future modification + * attempts. */ + m->wpmapped = FALSE; + refmod = pmap_disconnect(m->phys_page); + + /* Pull the dirty status from the pmap, since we cleared the + * wpmapped bit */ + if ((refmod & VM_MEM_MODIFIED) && !m->dirty) { + m->dirty = TRUE; + } + + /* On to the next page */ + start += PAGE_SIZE; + offset += PAGE_SIZE; + } + vm_object_unlock(object); + + return KERN_SUCCESS; +} +#endif diff --git a/osfmk/vm/vm_map.h b/osfmk/vm/vm_map.h index f520087ed..09eaa7473 100644 --- a/osfmk/vm/vm_map.h +++ b/osfmk/vm/vm_map.h @@ -1024,6 +1024,12 @@ extern kern_return_t vm_map_get_upl( int *flags, int force_data_sync); +#if CONFIG_DYNAMIC_CODE_SIGNING +extern kern_return_t vm_map_sign(vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end); +#endif + __END_DECLS #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/x86_64/loose_ends.c b/osfmk/x86_64/loose_ends.c index 5ad70b323..e8a1605a7 100644 --- a/osfmk/x86_64/loose_ends.c +++ b/osfmk/x86_64/loose_ends.c @@ -663,14 +663,15 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, pmap = 
thread->map->pmap; + + assert((vm_offset_t)kernel_addr >= VM_MIN_KERNEL_AND_KEXT_ADDRESS || + copy_type == COPYINPHYS || copy_type == COPYOUTPHYS); + /* Sanity and security check for addresses to/from a user */ - if ((copy_type == COPYIN || - copy_type == COPYINSTR || - copy_type == COPYOUT) && - (pmap != kernel_pmap) && - ((vm_offset_t)kernel_addr < VM_MIN_KERNEL_AND_KEXT_ADDRESS || - !IS_USERADDR64_CANONICAL(user_addr))) { - error = EACCES; + + if (((pmap != kernel_pmap) && (use_kernel_map == 0)) && + ((nbytes && (user_addr+nbytes <= user_addr)) || ((user_addr + nbytes) > vm_map_max(thread->map)))) { + error = EFAULT; goto out; } diff --git a/osfmk/x86_64/pmap.c b/osfmk/x86_64/pmap.c index 13c439a96..e53843224 100644 --- a/osfmk/x86_64/pmap.c +++ b/osfmk/x86_64/pmap.c @@ -90,7 +90,6 @@ */ #include -#include #include #include @@ -147,15 +146,6 @@ #include -/* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */ -#ifdef DEBUGINTERRUPTS -#define pmap_intr_assert() { \ - if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \ - panic("pmap interrupt assert %s, %d",__FILE__, __LINE__); \ -} -#else -#define pmap_intr_assert() -#endif #ifdef IWANTTODEBUG #undef DEBUG @@ -178,11 +168,6 @@ boolean_t no_shared_cr3 = DEBUG; /* TRUE for DEBUG by default */ * Forward declarations for internal functions. */ -void pmap_remove_range( - pmap_t pmap, - vm_map_offset_t va, - pt_entry_t *spte, - pt_entry_t *epte); void phys_attribute_clear( ppnum_t phys, @@ -209,166 +194,12 @@ int allow_stack_exec = 0; /* No apps may execute from the stack by default */ const boolean_t cpu_64bit = TRUE; /* Mais oui! */ -/* - * when spinning through pmap_remove - * ensure that we don't spend too much - * time with preemption disabled. - * I'm setting the current threshold - * to 20us - */ -#define MAX_PREEMPTION_LATENCY_NS 20000 - uint64_t max_preemption_latency_tsc = 0; - -/* - * Private data structures. 
- */ - -/* - * For each vm_page_t, there is a list of all currently - * valid virtual mappings of that page. An entry is - * a pv_rooted_entry_t; the list is the pv_table. - * - * N.B. with the new combo rooted/hashed scheme it is - * only possibly to remove individual non-rooted entries - * if they are found via the hashed chains as there is no - * way to unlink the singly linked hashed entries if navigated to - * via the queue list off the rooted entries. Think of it as - * hash/walk/pull, keeping track of the prev pointer while walking - * the singly linked hash list. All of this is to save memory and - * keep both types of pv_entries as small as possible. - */ - -/* - -PV HASHING Changes - JK 1/2007 - -Pve's establish physical to virtual mappings. These are used for aliasing of a -physical page to (potentially many) virtual addresses within pmaps. In the -previous implementation the structure of the pv_entries (each 16 bytes in size) was - -typedef struct pv_entry { - struct pv_entry_t next; - pmap_t pmap; - vm_map_offset_t va; -} *pv_entry_t; - -An initial array of these is created at boot time, one per physical page of -memory, indexed by the physical page number. Additionally, a pool of entries -is created from a pv_zone to be used as needed by pmap_enter() when it is -creating new mappings. Originally, we kept this pool around because the code -in pmap_enter() was unable to block if it needed an entry and none were -available - we'd panic. Some time ago I restructured the pmap_enter() code -so that for user pmaps it can block while zalloc'ing a pv structure and restart, -removing a panic from the code (in the case of the kernel pmap we cannot block -and still panic, so, we keep a separate hot pool for use only on kernel pmaps). -The pool has not been removed since there is a large performance gain keeping -freed pv's around for reuse and not suffering the overhead of zalloc for every -new pv we need. 
- -As pmap_enter() created new mappings it linked the new pve's for them off the -fixed pv array for that ppn (off the next pointer). These pve's are accessed -for several operations, one of them being address space teardown. In that case, -we basically do this - - for (every page/pte in the space) { - calc pve_ptr from the ppn in the pte - for (every pv in the list for the ppn) { - if (this pv is for this pmap/vaddr) { - do housekeeping - unlink/free the pv - } - } - } - -The problem arose when we were running, say 8000 (or even 2000) apache or -other processes and one or all terminate. The list hanging off each pv array -entry could have thousands of entries. We were continuously linearly searching -each of these lists as we stepped through the address space we were tearing -down. Because of the locks we hold, likely taking a cache miss for each node, -and interrupt disabling for MP issues the system became completely unresponsive -for many seconds while we did this. - -Realizing that pve's are accessed in two distinct ways (linearly running the -list by ppn for operations like pmap_page_protect and finding and -modifying/removing a single pve as part of pmap_enter processing) has led to -modifying the pve structures and databases. - -There are now two types of pve structures. A "rooted" structure which is -basically the original structure accessed in an array by ppn, and a ''hashed'' -structure accessed on a hash list via a hash of [pmap, vaddr]. These have been -designed with the two goals of minimizing wired memory and making the lookup of -a ppn faster. Since a vast majority of pages in the system are not aliased -and hence represented by a single pv entry I've kept the rooted entry size as -small as possible because there is one of these dedicated for every physical -page of memory. The hashed pve's are larger due to the addition of the hash -link and the ppn entry needed for matching while running the hash list to find -the entry we are looking for. 
This way, only systems that have lots of -aliasing (like 2000+ httpd procs) will pay the extra memory price. Both -structures have the same first three fields allowing some simplification in -the code. - -They have these shapes - -typedef struct pv_rooted_entry { - queue_head_t qlink; - vm_map_offset_t va; - pmap_t pmap; -} *pv_rooted_entry_t; - - -typedef struct pv_hashed_entry { - queue_head_t qlink; - vm_map_offset_t va; - pmap_t pmap; - ppnum_t ppn; - struct pv_hashed_entry *nexth; -} *pv_hashed_entry_t; - -The main flow difference is that the code is now aware of the rooted entry and -the hashed entries. Code that runs the pv list still starts with the rooted -entry and then continues down the qlink onto the hashed entries. Code that is -looking up a specific pv entry first checks the rooted entry and then hashes -and runs the hash list for the match. The hash list lengths are much smaller -than the original pv lists that contained all aliases for the specific ppn. - -*/ - -typedef struct pv_rooted_entry { - /* first three entries must match pv_hashed_entry_t */ - queue_head_t qlink; - vm_map_offset_t va; /* virtual address for mapping */ - pmap_t pmap; /* pmap where mapping lies */ -} *pv_rooted_entry_t; - -#define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0) - -pv_rooted_entry_t pv_head_table; /* array of entries, one per page */ - -typedef struct pv_hashed_entry { - /* first three entries must match pv_rooted_entry_t */ - queue_head_t qlink; - vm_map_offset_t va; - pmap_t pmap; - ppnum_t ppn; - struct pv_hashed_entry *nexth; -} *pv_hashed_entry_t; - -#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0) - -#define NPVHASH 4095 /* MUST BE 2^N - 1 */ pv_hashed_entry_t *pv_hash_table; /* hash lists */ uint32_t npvhash = 0; -//#define PV_DEBUG 1 /* uncomment to enable some PV debugging code */ -#ifdef PV_DEBUG -#define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized"); -#else -#define CHK_NPVHASH(x) -#endif - pv_hashed_entry_t pv_hashed_free_list = 
PV_HASHED_ENTRY_NULL; pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL; decl_simple_lock_data(,pv_hashed_free_list_lock) @@ -377,53 +208,7 @@ decl_simple_lock_data(,pv_hash_table_lock) int pv_hashed_free_count = 0; int pv_hashed_kern_free_count = 0; -#define PV_HASHED_LOW_WATER_MARK 5000 -#define PV_HASHED_KERN_LOW_WATER_MARK 100 -#define PV_HASHED_ALLOC_CHUNK 2000 -#define PV_HASHED_KERN_ALLOC_CHUNK 50 -thread_call_t mapping_adjust_call; -static thread_call_data_t mapping_adjust_call_data; -uint32_t mappingrecurse = 0; - -#define PV_HASHED_ALLOC(pvh_e) { \ - simple_lock(&pv_hashed_free_list_lock); \ - if ((pvh_e = pv_hashed_free_list) != 0) { \ - pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \ - pv_hashed_free_count--; \ - if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \ - if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \ - thread_call_enter(mapping_adjust_call); \ - } \ - simple_unlock(&pv_hashed_free_list_lock); \ -} - -#define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \ - simple_lock(&pv_hashed_free_list_lock); \ - pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; \ - pv_hashed_free_list = pvh_eh; \ - pv_hashed_free_count += pv_cnt; \ - simple_unlock(&pv_hashed_free_list_lock); \ -} - -#define PV_HASHED_KERN_ALLOC(pvh_e) { \ - simple_lock(&pv_hashed_kern_free_list_lock); \ - if ((pvh_e = pv_hashed_kern_free_list) != 0) { \ - pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \ - pv_hashed_kern_free_count--; \ - if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK)\ - if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \ - thread_call_enter(mapping_adjust_call); \ - } \ - simple_unlock(&pv_hashed_kern_free_list_lock); \ -} -#define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \ - simple_lock(&pv_hashed_kern_free_list_lock); \ - pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; \ - pv_hashed_kern_free_list = pvh_eh; \ - pv_hashed_kern_free_count += pv_cnt; \ - 
simple_unlock(&pv_hashed_kern_free_list_lock); \ -} zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */ @@ -436,10 +221,10 @@ static zone_t pdpt_zone; */ char *pv_lock_table; /* pointer to array of bits */ -#define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) + char *pv_hash_lock_table; -#define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) + /* * First and last physical addresses that we maintain any information @@ -453,97 +238,13 @@ static struct vm_object kpml4obj_object_store; static struct vm_object kpdptobj_object_store; /* - * Index into pv_head table, its lock bits, and the modify/reference and managed bits - */ - -#define pa_index(pa) (i386_btop(pa)) -#define ppn_to_pai(ppn) ((int)ppn) - -#define pai_to_pvh(pai) (&pv_head_table[pai]) -#define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table) -#define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table) - -static inline uint32_t -pvhashidx(pmap_t pmap, vm_offset_t va) -{ - return ((uint32_t)(uint64_t)pmap ^ - ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) & - npvhash; -} -#define pvhash(idx) (&pv_hash_table[idx]) - -#define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table) -#define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table) - -/* - * Array of physical page attribites for managed pages. + * Array of physical page attributes for managed pages. * One byte per physical page. */ char *pmap_phys_attributes; unsigned int last_managed_page = 0; -#define IS_MANAGED_PAGE(x) \ - ((unsigned int)(x) <= last_managed_page && \ - (pmap_phys_attributes[x] & PHYS_MANAGED)) - -/* - * Physical page attributes. Copy bits from PTE definition. - */ -#define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */ -#define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */ -#define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */ - -/* - * Amount of virtual memory mapped by one - * page-directory entry. 
- */ -#define PDE_MAPPED_SIZE (pdetova(1)) uint64_t pde_mapped_size = PDE_MAPPED_SIZE; -/* - * Locking and TLB invalidation - */ - -/* - * Locking Protocols: (changed 2/2007 JK) - * - * There are two structures in the pmap module that need locking: - * the pmaps themselves, and the per-page pv_lists (which are locked - * by locking the pv_lock_table entry that corresponds to the pv_head - * for the list in question.) Most routines want to lock a pmap and - * then do operations in it that require pv_list locking -- however - * pmap_remove_all and pmap_copy_on_write operate on a physical page - * basis and want to do the locking in the reverse order, i.e. lock - * a pv_list and then go through all the pmaps referenced by that list. - * - * The system wide pmap lock has been removed. Now, paths take a lock - * on the pmap before changing its 'shape' and the reverse order lockers - * (coming in by phys ppn) take a lock on the corresponding pv and then - * retest to be sure nothing changed during the window before they locked - * and can then run up/down the pv lists holding the list lock. This also - * lets the pmap layer run (nearly completely) interrupt enabled, unlike - * previously. 
- */ - -/* - * PV locking - */ - -#define LOCK_PVH(index) { \ - mp_disable_preemption(); \ - lock_pvh_pai(index); \ -} - -#define UNLOCK_PVH(index) { \ - unlock_pvh_pai(index); \ - mp_enable_preemption(); \ -} -/* - * PV hash locking - */ - -#define LOCK_PV_HASH(hash) lock_hash_hash(hash) -#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash) - unsigned pmap_memory_region_count; unsigned pmap_memory_region_current; @@ -562,8 +263,6 @@ pd_entry_t commpage64_pde; struct zone *pmap_zone; /* zone of pmap structures */ -int pmap_debug = 0; /* flag for debugging prints */ - unsigned int inuse_ptepages_count = 0; addr64_t kernel64_cr3; @@ -585,170 +284,6 @@ static int nkpt; pt_entry_t *DMAP1, *DMAP2; caddr_t DADDR1; caddr_t DADDR2; - -/* - * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain. - * properly deals with the anchor. - * must be called with the hash locked, does not unlock it - */ - -static inline void -pmap_pvh_unlink(pv_hashed_entry_t pvh) -{ - pv_hashed_entry_t curh; - pv_hashed_entry_t *pprevh; - int pvhash_idx; - - CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh->pmap, pvh->va); - - pprevh = pvhash(pvhash_idx); - -#if PV_DEBUG - if (NULL == *pprevh) - panic("pvh_unlink null anchor"); /* JK DEBUG */ -#endif - curh = *pprevh; - - while (PV_HASHED_ENTRY_NULL != curh) { - if (pvh == curh) - break; - pprevh = &curh->nexth; - curh = curh->nexth; - } - if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh"); - *pprevh = pvh->nexth; - return; -} - -static inline void -pv_hash_add(pv_hashed_entry_t pvh_e, - pv_rooted_entry_t pv_h) -{ - pv_hashed_entry_t *hashp; - int pvhash_idx; - - CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va); - LOCK_PV_HASH(pvhash_idx); - insque(&pvh_e->qlink, &pv_h->qlink); - hashp = pvhash(pvhash_idx); -#if PV_DEBUG - if (NULL==hashp) - panic("pv_hash_add(%p) null hash bucket", pvh_e); -#endif - pvh_e->nexth = *hashp; - *hashp = pvh_e; - UNLOCK_PV_HASH(pvhash_idx); -} - -static inline void 
-pv_hash_remove(pv_hashed_entry_t pvh_e) -{ - int pvhash_idx; - - CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va); - LOCK_PV_HASH(pvhash_idx); - remque(&pvh_e->qlink); - pmap_pvh_unlink(pvh_e); - UNLOCK_PV_HASH(pvhash_idx); -} - -/* - * Remove pv list entry. - * Called with pv_head_table entry locked. - * Returns pv entry to be freed (or NULL). - */ -static inline pv_hashed_entry_t -pmap_pv_remove(pmap_t pmap, - vm_map_offset_t vaddr, - ppnum_t ppn) -{ - pv_hashed_entry_t pvh_e; - pv_rooted_entry_t pv_h; - pv_hashed_entry_t *pprevh; - int pvhash_idx; - uint32_t pv_cnt; - - pvh_e = PV_HASHED_ENTRY_NULL; - pv_h = pai_to_pvh(ppn_to_pai(ppn)); - if (pv_h->pmap == PMAP_NULL) - panic("pmap_pv_remove(%p,%llu,%u): null pv_list!", - pmap, vaddr, ppn); - - if (pv_h->va == vaddr && pv_h->pmap == pmap) { - /* - * Header is the pv_rooted_entry. - * We can't free that. If there is a queued - * entry after this one we remove that - * from the ppn queue, we remove it from the hash chain - * and copy it to the rooted entry. Then free it instead. - */ - pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink); - if (pv_h != (pv_rooted_entry_t) pvh_e) { - /* - * Entry queued to root, remove this from hash - * and install as nem root. - */ - CHK_NPVHASH(); - pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va); - LOCK_PV_HASH(pvhash_idx); - remque(&pvh_e->qlink); - pprevh = pvhash(pvhash_idx); - if (PV_HASHED_ENTRY_NULL == *pprevh) { - panic("pmap_pv_remove(%p,%llu,%u): " - "empty hash, removing rooted", - pmap, vaddr, ppn); - } - pmap_pvh_unlink(pvh_e); - UNLOCK_PV_HASH(pvhash_idx); - pv_h->pmap = pvh_e->pmap; - pv_h->va = pvh_e->va; /* dispose of pvh_e */ - } else { - /* none queued after rooted */ - pv_h->pmap = PMAP_NULL; - pvh_e = PV_HASHED_ENTRY_NULL; - } - } else { - /* - * not removing rooted pv. 
find it on hash chain, remove from - * ppn queue and hash chain and free it - */ - CHK_NPVHASH(); - pvhash_idx = pvhashidx(pmap, vaddr); - LOCK_PV_HASH(pvhash_idx); - pprevh = pvhash(pvhash_idx); - if (PV_HASHED_ENTRY_NULL == *pprevh) { - panic("pmap_pv_remove(%p,%llu,%u): empty hash", - pmap, vaddr, ppn); - } - pvh_e = *pprevh; - pmap_pv_hashlist_walks++; - pv_cnt = 0; - while (PV_HASHED_ENTRY_NULL != pvh_e) { - pv_cnt++; - if (pvh_e->pmap == pmap && - pvh_e->va == vaddr && - pvh_e->ppn == ppn) - break; - pprevh = &pvh_e->nexth; - pvh_e = pvh_e->nexth; - } - if (PV_HASHED_ENTRY_NULL == pvh_e) - panic("pmap_pv_remove(%p,%llu,%u): pv not on hash", - pmap, vaddr, ppn); - pmap_pv_hashlist_cnts += pv_cnt; - if (pmap_pv_hashlist_max < pv_cnt) - pmap_pv_hashlist_max = pv_cnt; - *pprevh = pvh_e->nexth; - remque(&pvh_e->qlink); - UNLOCK_PV_HASH(pvhash_idx); - } - - return pvh_e; -} - /* * for legacy, returns the address of the pde entry. * for 64 bit, causes the pdpt page containing the pde entry to be mapped, @@ -1463,147 +998,6 @@ pmap_reference(pmap_t p) } } -/* - * Remove a range of hardware page-table entries. - * The entries given are the first (inclusive) - * and last (exclusive) entries for the VM pages. - * The virtual address is the va for the first pte. - * - * The pmap must be locked. - * If the pmap is not the kernel pmap, the range must lie - * entirely within one pte-page. This is NOT checked. - * Assumes that the pte-page exists. 
- */ - -void -pmap_remove_range( - pmap_t pmap, - vm_map_offset_t start_vaddr, - pt_entry_t *spte, - pt_entry_t *epte) -{ - pt_entry_t *cpte; - pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL; - pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL; - pv_hashed_entry_t pvh_e; - int pvh_cnt = 0; - int num_removed, num_unwired, num_found; - int pai; - pmap_paddr_t pa; - vm_map_offset_t vaddr; - - num_removed = 0; - num_unwired = 0; - num_found = 0; - - /* invalidate the PTEs first to "freeze" them */ - for (cpte = spte, vaddr = start_vaddr; - cpte < epte; - cpte++, vaddr += PAGE_SIZE_64) { - - pa = pte_to_pa(*cpte); - if (pa == 0) - continue; - num_found++; - - if (iswired(*cpte)) - num_unwired++; - - pai = pa_index(pa); - - if (!IS_MANAGED_PAGE(pai)) { - /* - * Outside range of managed physical memory. - * Just remove the mappings. - */ - pmap_store_pte(cpte, 0); - continue; - } - - /* invalidate the PTE */ - pmap_update_pte(cpte, *cpte, (*cpte & ~INTEL_PTE_VALID)); - } - - if (num_found == 0) { - /* nothing was changed: we're done */ - goto update_counts; - } - - /* propagate the invalidates to other CPUs */ - - PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr); - - for (cpte = spte, vaddr = start_vaddr; - cpte < epte; - cpte++, vaddr += PAGE_SIZE_64) { - - pa = pte_to_pa(*cpte); - if (pa == 0) - continue; - - pai = pa_index(pa); - - LOCK_PVH(pai); - - pa = pte_to_pa(*cpte); - if (pa == 0) { - UNLOCK_PVH(pai); - continue; - } - num_removed++; - - /* - * Get the modify and reference bits, then - * nuke the entry in the page table - */ - /* remember reference and change */ - pmap_phys_attributes[pai] |= - (char) (*cpte & (PHYS_MODIFIED | PHYS_REFERENCED)); - /* completely invalidate the PTE */ - pmap_store_pte(cpte, 0); - - /* - * Remove the mapping from the pvlist for this physical page. 
- */ - pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t) pai); - - UNLOCK_PVH(pai); - - if (pvh_e != PV_HASHED_ENTRY_NULL) { - pvh_e->qlink.next = (queue_entry_t) pvh_eh; - pvh_eh = pvh_e; - - if (pvh_et == PV_HASHED_ENTRY_NULL) { - pvh_et = pvh_e; - } - pvh_cnt++; - } - } /* for loop */ - - if (pvh_eh != PV_HASHED_ENTRY_NULL) { - PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt); - } -update_counts: - /* - * Update the counts - */ -#if TESTING - if (pmap->stats.resident_count < num_removed) - panic("pmap_remove_range: resident_count"); -#endif - assert(pmap->stats.resident_count >= num_removed); - OSAddAtomic(-num_removed, &pmap->stats.resident_count); - -#if TESTING - if (pmap->stats.wired_count < num_unwired) - panic("pmap_remove_range: wired_count"); -#endif - assert(pmap->stats.wired_count >= num_unwired); - OSAddAtomic(-num_unwired, &pmap->stats.wired_count); - - return; -} - /* * Remove phys addr if mapped in specified map * @@ -1618,274 +1012,6 @@ pmap_remove_some_phys( } -/* - * Remove the given range of addresses - * from the specified map. - * - * It is assumed that the start and end are properly - * rounded to the hardware page size. - */ -void -pmap_remove( - pmap_t map, - addr64_t s64, - addr64_t e64) -{ - pt_entry_t *pde; - pt_entry_t *spte, *epte; - addr64_t l64; - uint64_t deadline; - - pmap_intr_assert(); - - if (map == PMAP_NULL || s64 == e64) - return; - - PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START, - map, - (uint32_t) (s64 >> 32), s64, - (uint32_t) (e64 >> 32), e64); - - - PMAP_LOCK(map); - -#if 0 - /* - * Check that address range in the kernel does not overlap the stacks. - * We initialize local static min/max variables once to avoid making - * 2 function calls for every remove. Note also that these functions - * both return 0 before kernel stacks have been initialized, and hence - * the panic is not triggered in this case. 
- */ - if (map == kernel_pmap) { - static vm_offset_t kernel_stack_min = 0; - static vm_offset_t kernel_stack_max = 0; - - if (kernel_stack_min == 0) { - kernel_stack_min = min_valid_stack_address(); - kernel_stack_max = max_valid_stack_address(); - } - if ((kernel_stack_min <= s64 && s64 < kernel_stack_max) || - (kernel_stack_min < e64 && e64 <= kernel_stack_max)) - panic("pmap_remove() attempted in kernel stack"); - } -#else - - /* - * The values of kernel_stack_min and kernel_stack_max are no longer - * relevant now that we allocate kernel stacks in the kernel map, - * so the old code above no longer applies. If we wanted to check that - * we weren't removing a mapping of a page in a kernel stack we'd - * mark the PTE with an unused bit and check that here. - */ - -#endif - - deadline = rdtsc64() + max_preemption_latency_tsc; - - while (s64 < e64) { - l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size - 1); - if (l64 > e64) - l64 = e64; - pde = pmap_pde(map, s64); - - if (pde && (*pde & INTEL_PTE_VALID)) { - if (*pde & INTEL_PTE_PS) { - /* - * If we're removing a superpage, pmap_remove_range() - * must work on level 2 instead of level 1; and we're - * only passing a single level 2 entry instead of a - * level 1 range. - */ - spte = pde; - epte = spte+1; /* excluded */ - } else { - spte = pmap_pte(map, (s64 & ~(pde_mapped_size - 1))); - spte = &spte[ptenum(s64)]; - epte = &spte[intel_btop(l64 - s64)]; - } - pmap_remove_range(map, s64, spte, epte); - } - s64 = l64; - pde++; - - if (s64 < e64 && rdtsc64() >= deadline) { - PMAP_UNLOCK(map) - PMAP_LOCK(map) - deadline = rdtsc64() + max_preemption_latency_tsc; - } - } - - PMAP_UNLOCK(map); - - PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END, - map, 0, 0, 0, 0); - -} - -/* - * Routine: pmap_page_protect - * - * Function: - * Lower the permission for all mappings to a given - * page. 
- */ -void -pmap_page_protect( - ppnum_t pn, - vm_prot_t prot) -{ - pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL; - pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL; - pv_hashed_entry_t nexth; - int pvh_cnt = 0; - pv_rooted_entry_t pv_h; - pv_rooted_entry_t pv_e; - pv_hashed_entry_t pvh_e; - pt_entry_t *pte; - int pai; - pmap_t pmap; - boolean_t remove; - - pmap_intr_assert(); - assert(pn != vm_page_fictitious_addr); - if (pn == vm_page_guard_addr) - return; - - pai = ppn_to_pai(pn); - - if (!IS_MANAGED_PAGE(pai)) { - /* - * Not a managed page. - */ - return; - } - PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, - pn, prot, 0, 0, 0); - - /* - * Determine the new protection. - */ - switch (prot) { - case VM_PROT_READ: - case VM_PROT_READ | VM_PROT_EXECUTE: - remove = FALSE; - break; - case VM_PROT_ALL: - return; /* nothing to do */ - default: - remove = TRUE; - break; - } - - pv_h = pai_to_pvh(pai); - - LOCK_PVH(pai); - - - /* - * Walk down PV list, if any, changing or removing all mappings. - */ - if (pv_h->pmap == PMAP_NULL) - goto done; - - pv_e = pv_h; - pvh_e = (pv_hashed_entry_t) pv_e; /* cheat */ - - do { - vm_map_offset_t vaddr; - - pmap = pv_e->pmap; - vaddr = pv_e->va; - pte = pmap_pte(pmap, vaddr); - if (0 == pte) { - panic("pmap_page_protect() " - "pmap=%p pn=0x%x vaddr=0x%llx\n", - pmap, pn, vaddr); - } - nexth = (pv_hashed_entry_t) queue_next(&pvh_e->qlink); - - /* - * Remove the mapping if new protection is NONE - * or if write-protecting a kernel mapping. - */ - if (remove || pmap == kernel_pmap) { - /* - * Remove the mapping, collecting dirty bits. 
- */ - pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_VALID); - PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); - pmap_phys_attributes[pai] |= - *pte & (PHYS_MODIFIED|PHYS_REFERENCED); - pmap_store_pte(pte, 0); - -#if TESTING - if (pmap->stats.resident_count < 1) - panic("pmap_page_protect: resident_count"); -#endif - assert(pmap->stats.resident_count >= 1); - OSAddAtomic(-1, &pmap->stats.resident_count); - - /* - * Deal with the pv_rooted_entry. - */ - - if (pv_e == pv_h) { - /* - * Fix up head later. - */ - pv_h->pmap = PMAP_NULL; - } else { - /* - * Delete this entry. - */ - pv_hash_remove(pvh_e); - pvh_e->qlink.next = (queue_entry_t) pvh_eh; - pvh_eh = pvh_e; - - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pvh_cnt++; - } - } else { - /* - * Write-protect. - */ - pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_WRITE); - PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); - } - pvh_e = nexth; - } while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h); - - - /* - * If pv_head mapping was removed, fix it up. - */ - if (pv_h->pmap == PMAP_NULL) { - pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink); - - if (pvh_e != (pv_hashed_entry_t) pv_h) { - pv_hash_remove(pvh_e); - pv_h->pmap = pvh_e->pmap; - pv_h->va = pvh_e->va; - pvh_e->qlink.next = (queue_entry_t) pvh_eh; - pvh_eh = pvh_e; - - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pvh_cnt++; - } - } - if (pvh_eh != PV_HASHED_ENTRY_NULL) { - PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt); - } -done: - UNLOCK_PVH(pai); - - PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END, - 0, 0, 0, 0, 0); -} - /* * Routine: @@ -2019,398 +1145,6 @@ pmap_map_block( } } - -/* - * Insert the given physical page (p) at - * the specified virtual address (v) in the - * target physical map with the protection requested. - * - * If specified, the page will be wired down, meaning - * that the related pte cannot be reclaimed. - * - * NB: This is the only routine which MAY NOT lazy-evaluate - * or lose information. 
That is, this routine must actually - * insert this page into the given map NOW. - */ -void -pmap_enter( - register pmap_t pmap, - vm_map_offset_t vaddr, - ppnum_t pn, - vm_prot_t prot, - unsigned int flags, - boolean_t wired) -{ - pt_entry_t *pte; - pv_rooted_entry_t pv_h; - int pai; - pv_hashed_entry_t pvh_e; - pv_hashed_entry_t pvh_new; - pt_entry_t template; - pmap_paddr_t old_pa; - pmap_paddr_t pa = (pmap_paddr_t) i386_ptob(pn); - boolean_t need_tlbflush = FALSE; - boolean_t set_NX; - char oattr; - boolean_t old_pa_locked; - boolean_t superpage = flags & VM_MEM_SUPERPAGE; - vm_object_t delpage_pm_obj = NULL; - int delpage_pde_index = 0; - - - pmap_intr_assert(); - assert(pn != vm_page_fictitious_addr); - if (pmap_debug) - kprintf("pmap_enter(%p,%llu,%u)\n", pmap, vaddr, pn); - if (pmap == PMAP_NULL) - return; - if (pn == vm_page_guard_addr) - return; - - PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START, - pmap, - (uint32_t) (vaddr >> 32), (uint32_t) vaddr, - pn, prot); - - if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled) - set_NX = FALSE; - else - set_NX = TRUE; - - /* - * Must allocate a new pvlist entry while we're unlocked; - * zalloc may cause pageout (which will lock the pmap system). - * If we determine we need a pvlist entry, we will unlock - * and allocate one. Then we will retry, throughing away - * the allocated entry later (if we no longer need it). - */ - - pvh_new = PV_HASHED_ENTRY_NULL; -Retry: - pvh_e = PV_HASHED_ENTRY_NULL; - - PMAP_LOCK(pmap); - - /* - * Expand pmap to include this pte. Assume that - * pmap is always expanded to include enough hardware - * pages to map one VM page. 
- */ - if(superpage) { - while ((pte = pmap64_pde(pmap, vaddr)) == PD_ENTRY_NULL) { - /* need room for another pde entry */ - PMAP_UNLOCK(pmap); - pmap_expand_pdpt(pmap, vaddr); - PMAP_LOCK(pmap); - } - } else { - while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) { - /* - * Must unlock to expand the pmap - * going to grow pde level page(s) - */ - PMAP_UNLOCK(pmap); - pmap_expand(pmap, vaddr); - PMAP_LOCK(pmap); - } - } - - if (superpage && *pte && !(*pte & INTEL_PTE_PS)) { - /* - * There is still an empty page table mapped that - * was used for a previous base page mapping. - * Remember the PDE and the PDE index, so that we - * can free the page at the end of this function. - */ - delpage_pde_index = (int)pdeidx(pmap, vaddr); - delpage_pm_obj = pmap->pm_obj; - *pte = 0; - } - - old_pa = pte_to_pa(*pte); - pai = pa_index(old_pa); - old_pa_locked = FALSE; - - /* - * if we have a previous managed page, lock the pv entry now. after - * we lock it, check to see if someone beat us to the lock and if so - * drop the lock - */ - if ((0 != old_pa) && IS_MANAGED_PAGE(pai)) { - LOCK_PVH(pai); - old_pa_locked = TRUE; - old_pa = pte_to_pa(*pte); - if (0 == old_pa) { - UNLOCK_PVH(pai); /* another path beat us to it */ - old_pa_locked = FALSE; - } - } - - /* - * Special case if the incoming physical page is already mapped - * at this address. 
- */ - if (old_pa == pa) { - - /* - * May be changing its wired attribute or protection - */ - - template = pa_to_pte(pa) | INTEL_PTE_VALID; - - if (VM_MEM_NOT_CACHEABLE == - (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) { - if (!(flags & VM_MEM_GUARDED)) - template |= INTEL_PTE_PTA; - template |= INTEL_PTE_NCACHE; - } - if (pmap != kernel_pmap) - template |= INTEL_PTE_USER; - if (prot & VM_PROT_WRITE) - template |= INTEL_PTE_WRITE; - - if (set_NX) - template |= INTEL_PTE_NX; - - if (wired) { - template |= INTEL_PTE_WIRED; - if (!iswired(*pte)) - OSAddAtomic(+1, - &pmap->stats.wired_count); - } else { - if (iswired(*pte)) { - assert(pmap->stats.wired_count >= 1); - OSAddAtomic(-1, - &pmap->stats.wired_count); - } - } - if (superpage) /* this path can not be used */ - template |= INTEL_PTE_PS; /* to change the page size! */ - - /* store modified PTE and preserve RC bits */ - pmap_update_pte(pte, *pte, - template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD))); - if (old_pa_locked) { - UNLOCK_PVH(pai); - old_pa_locked = FALSE; - } - need_tlbflush = TRUE; - goto Done; - } - - /* - * Outline of code from here: - * 1) If va was mapped, update TLBs, remove the mapping - * and remove old pvlist entry. - * 2) Add pvlist entry for new mapping - * 3) Enter new mapping. - * - * If the old physical page is not managed step 1) is skipped - * (except for updating the TLBs), and the mapping is - * overwritten at step 3). If the new physical page is not - * managed, step 2) is skipped. - */ - - if (old_pa != (pmap_paddr_t) 0) { - - /* - * Don't do anything to pages outside valid memory here. - * Instead convince the code that enters a new mapping - * to overwrite the old one. 
- */ - - /* invalidate the PTE */ - pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID)); - /* propagate invalidate everywhere */ - PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); - /* remember reference and change */ - oattr = (char) (*pte & (PHYS_MODIFIED | PHYS_REFERENCED)); - /* completely invalidate the PTE */ - pmap_store_pte(pte, 0); - - if (IS_MANAGED_PAGE(pai)) { -#if TESTING - if (pmap->stats.resident_count < 1) - panic("pmap_enter: resident_count"); -#endif - assert(pmap->stats.resident_count >= 1); - OSAddAtomic(-1, - &pmap->stats.resident_count); - - if (iswired(*pte)) { -#if TESTING - if (pmap->stats.wired_count < 1) - panic("pmap_enter: wired_count"); -#endif - assert(pmap->stats.wired_count >= 1); - OSAddAtomic(-1, - &pmap->stats.wired_count); - } - pmap_phys_attributes[pai] |= oattr; - - /* - * Remove the mapping from the pvlist for - * this physical page. - * We'll end up with either a rooted pv or a - * hashed pv - */ - pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t) pai); - - } else { - - /* - * old_pa is not managed. - * Do removal part of accounting. - */ - - if (iswired(*pte)) { - assert(pmap->stats.wired_count >= 1); - OSAddAtomic(-1, - &pmap->stats.wired_count); - } - } - } - - /* - * if we had a previously managed paged locked, unlock it now - */ - if (old_pa_locked) { - UNLOCK_PVH(pai); - old_pa_locked = FALSE; - } - - pai = pa_index(pa); /* now working with new incoming phys page */ - if (IS_MANAGED_PAGE(pai)) { - - /* - * Step 2) Enter the mapping in the PV list for this - * physical page. - */ - pv_h = pai_to_pvh(pai); - - LOCK_PVH(pai); - - if (pv_h->pmap == PMAP_NULL) { - /* - * No mappings yet, use rooted pv - */ - pv_h->va = vaddr; - pv_h->pmap = pmap; - queue_init(&pv_h->qlink); - } else { - /* - * Add new pv_hashed_entry after header. 
- */ - if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) { - pvh_e = pvh_new; - pvh_new = PV_HASHED_ENTRY_NULL; - } else if (PV_HASHED_ENTRY_NULL == pvh_e) { - PV_HASHED_ALLOC(pvh_e); - if (PV_HASHED_ENTRY_NULL == pvh_e) { - /* - * the pv list is empty. if we are on - * the kernel pmap we'll use one of - * the special private kernel pv_e's, - * else, we need to unlock - * everything, zalloc a pv_e, and - * restart bringing in the pv_e with - * us. - */ - if (kernel_pmap == pmap) { - PV_HASHED_KERN_ALLOC(pvh_e); - } else { - UNLOCK_PVH(pai); - PMAP_UNLOCK(pmap); - pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); - goto Retry; - } - } - } - if (PV_HASHED_ENTRY_NULL == pvh_e) - panic("pvh_e exhaustion"); - - pvh_e->va = vaddr; - pvh_e->pmap = pmap; - pvh_e->ppn = pn; - pv_hash_add(pvh_e, pv_h); - - /* - * Remember that we used the pvlist entry. - */ - pvh_e = PV_HASHED_ENTRY_NULL; - } - - /* - * only count the mapping - * for 'managed memory' - */ - OSAddAtomic(+1, & pmap->stats.resident_count); - if (pmap->stats.resident_count > pmap->stats.resident_max) { - pmap->stats.resident_max = pmap->stats.resident_count; - } - } - /* - * Step 3) Enter the mapping. - * - * Build a template to speed up entering - - * only the pfn changes. 
- */ - template = pa_to_pte(pa) | INTEL_PTE_VALID; - - if (flags & VM_MEM_NOT_CACHEABLE) { - if (!(flags & VM_MEM_GUARDED)) - template |= INTEL_PTE_PTA; - template |= INTEL_PTE_NCACHE; - } - if (pmap != kernel_pmap) - template |= INTEL_PTE_USER; - if (prot & VM_PROT_WRITE) - template |= INTEL_PTE_WRITE; - if (set_NX) - template |= INTEL_PTE_NX; - if (wired) { - template |= INTEL_PTE_WIRED; - OSAddAtomic(+1, & pmap->stats.wired_count); - } - if (superpage) - template |= INTEL_PTE_PS; - pmap_store_pte(pte, template); - - /* - * if this was a managed page we delayed unlocking the pv until here - * to prevent pmap_page_protect et al from finding it until the pte - * has been stored - */ - if (IS_MANAGED_PAGE(pai)) { - UNLOCK_PVH(pai); - } -Done: - if (need_tlbflush == TRUE) - PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); - - if (pvh_e != PV_HASHED_ENTRY_NULL) { - PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1); - } - if (pvh_new != PV_HASHED_ENTRY_NULL) { - PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1); - } - PMAP_UNLOCK(pmap); - - if (delpage_pm_obj) { - vm_page_t m; - - vm_object_lock(delpage_pm_obj); - m = vm_page_lookup(delpage_pm_obj, delpage_pde_index); - if (m == VM_PAGE_NULL) - panic("pmap_enter: pte page not in object"); - VM_PAGE_FREE(m); - OSAddAtomic(-1, &inuse_ptepages_count); - vm_object_unlock(delpage_pm_obj); - } - - PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0); -} - /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address @@ -3341,96 +2075,6 @@ phys_page_exists(ppnum_t pn) return TRUE; } -void -mapping_free_prime(void) -{ - int i; - pv_hashed_entry_t pvh_e; - pv_hashed_entry_t pvh_eh; - pv_hashed_entry_t pvh_et; - int pv_cnt; - - pv_cnt = 0; - pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; - for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) { - pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); - - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; - - if (pvh_et == PV_HASHED_ENTRY_NULL) - 
pvh_et = pvh_e; - pv_cnt++; - } - PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt); - - pv_cnt = 0; - pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; - for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) { - pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); - - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; - - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pv_cnt++; - } - PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt); - -} - -void -mapping_adjust(void) -{ - pv_hashed_entry_t pvh_e; - pv_hashed_entry_t pvh_eh; - pv_hashed_entry_t pvh_et; - int pv_cnt; - int i; - - if (mapping_adjust_call == NULL) { - thread_call_setup(&mapping_adjust_call_data, - (thread_call_func_t) mapping_adjust, - (thread_call_param_t) NULL); - mapping_adjust_call = &mapping_adjust_call_data; - } - - pv_cnt = 0; - pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; - if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) { - for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) { - pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); - - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; - - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pv_cnt++; - } - PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt); - } - - pv_cnt = 0; - pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL; - if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) { - for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) { - pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone); - - pvh_e->qlink.next = (queue_entry_t)pvh_eh; - pvh_eh = pvh_e; - - if (pvh_et == PV_HASHED_ENTRY_NULL) - pvh_et = pvh_e; - pv_cnt++; - } - PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt); - } - mappingrecurse = 0; -} - - void pmap_switch(pmap_t tpmap) { diff --git a/pexpert/gen/bootargs.c b/pexpert/gen/bootargs.c index 4c5b5f07d..0e8749b2b 100644 --- a/pexpert/gen/bootargs.c +++ b/pexpert/gen/bootargs.c @@ -260,3 +260,9 @@ getval( *val = 1; return (NUM); } + +boolean_t +PE_imgsrc_mount_supported() +{ + return TRUE; +} diff --git 
a/pexpert/i386/pe_init.c b/pexpert/i386/pe_init.c index 614382096..5a8fa5eff 100644 --- a/pexpert/i386/pe_init.c +++ b/pexpert/i386/pe_init.c @@ -184,15 +184,15 @@ void PE_init_platform(boolean_t vm_initialized, void * _args) /* Hack! FIXME.. */ outb(0x21, 0xff); /* Maskout all interrupts Pic1 */ outb(0xa1, 0xff); /* Maskout all interrupts Pic2 */ - if (PE_state.deviceTreeHead) { DTInit(PE_state.deviceTreeHead); - } + } pe_identify_machine(args); } else { pe_init_debug(); } + } void PE_create_console( void ) @@ -274,5 +274,8 @@ PE_stub_poll_input(__unused unsigned int options, char * c) int (*PE_poll_input)(unsigned int options, char * c) = PE_stub_poll_input; - - +boolean_t +PE_reboot_on_panic(void) +{ + return FALSE; +} diff --git a/pexpert/pexpert/pexpert.h b/pexpert/pexpert/pexpert.h index 3dd73dad7..7c3596f9f 100644 --- a/pexpert/pexpert/pexpert.h +++ b/pexpert/pexpert/pexpert.h @@ -189,6 +189,8 @@ typedef struct PE_Video PE_Video; extern void initialize_screen(PE_Video *, unsigned int); +extern void dim_screen(void); + extern int PE_current_console( PE_Video *info); @@ -275,6 +277,12 @@ extern void PE_cpu_machine_quiesce( extern void pe_init_debug(void); +extern boolean_t PE_imgsrc_mount_supported(void); + + +#if KERNEL_PRIVATE +boolean_t PE_reboot_on_panic(void); +#endif __END_DECLS